⚠️ Warning: This is a draft ⚠️

This means it might contain formatting issues, incorrect code, conceptual problems, or other severe issues.

If you want to help to improve and eventually enable this page, please fork RosettaGit's repository and open a merge request on GitHub.

==ALGOL 68 URI Parser== This is a URI parser implemented in Algol 68. The text can be cut-and-paste into an Algol 68 program or saved in a file and included in another program by using the read pragma available in Algol 68, e.g.:

PR read "uriParser.a68" PR
URI u := parse uri( "fred://[email protected]" );
...

ALGOL 68

# URI parser #

# MODE returned by the URI parser #
MODE URI = STRUCT( STRING  scheme
                 , STRING  userinfo
                 , STRING  host
                 , STRING  port
                 , STRING  path         
                 , STRING  query 
                 , STRING  fragment id
                 , BOOL    ok     # TRUE if the URI parse was OK #
                 , STRING  error  # error message if the parse failed #
                 );

# returns the URI parsed from text                                    #
# ok OF the result will be TRUE  if the parse was successful          #
# ok OF the result will be FALSE if the parse failed                  #
#       and error OF the result will be a suitable error message      #
# the authority is split into the userinfo, host and port fields      #
# and not returned as a separate combined field                       #
PROC parse uri = ( STRING text )URI:
     BEGIN
         INT    pos         := 0;    # current character position     #
         INT    end pos     := 0;    # last character position        #

         STRING alphas       = "abcdefghijklmnopqrstuvwxyz"
                             + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                             ;
         STRING digits       = "0123456789";
         STRING sub delims   = "!$&'()*+,;=";
         STRING unreserved   = alphas + digits + "-._~";
         STRING hex digits   = digits + "abcdefABCDEF";


         # sets the error message of the result and indicates the parse failed #
         PROC error = ( STRING message )VOID:
              BEGIN
                  ok    OF result := FALSE;
                  error OF result := message + " (near position " + whole( pos, 0 ) + ")"
              END # error # ;

         # returns TRUE if we have passed the end of text, FALSE otherwise #
         PROC at end = BOOL: pos > end pos;

         # returns the current character from the string #
         #      or REPR 0 if we have passed the end of the string #
         PROC curr char = CHAR: IF at end THEN REPR 0 ELSE text[ pos ] FI;

         # returns the character n positions after the current one or REPR 0 if there isn't one #
         PROC peek = ( INT n )CHAR: IF pos + n > end pos THEN REPR 0 ELSE text[ pos + n ] FI;

         # returns TRUE if the current character is ch, FALSE otherwise #
         PROC have = ( CHAR ch )BOOL: curr char = ch;

         # returns TRUE if the current character is one of the specified characters, FALSE otherwise #
         PROC have one of = ( STRING characters )BOOL: char in string( curr char, NIL, characters );

         # returns TRUE if ch is a letter (a-z, A-Z only), FALSE otherwise #
         PROC is letter = ( CHAR ch )BOOL: char in string( ch, NIL, alphas );

         # returns TRUE if ch is a hex digit, FALSE otherwise #
         PROC is hex    = ( CHAR ch )BOOL: char in string( ch, NIL, hex digits );

         # positions to the next character, if there is one #
         PROC next char = VOID: IF at end THEN pos := end pos + 1 ELSE pos +:= 1 FI;

         # returns and skips over the sequence of chatracters matching the specified characters #
         #         or hex encoded characters ( "%" followed by 2 hex digits )                   #
         PROC possibly encoded seq = ( STRING characters )STRING:
              BEGIN
                  STRING result := "";
                  BOOL   ok     := TRUE;
                  WHILE CHAR ch := curr char;
                        ok AND ( have one of( characters ) OR ch = "%" )
                  DO
                      IF ch = "%"
                      THEN
                          # should be "%" followed by a hex digit and a hex digit           #
                          IF NOT is hex( peek( 1 ) ) OR NOT is hex( peek( 2 ) )
                          THEN
                              # invalid encoded character                                   #
                              error( "Invalid encoded character" );
                              ok := FALSE
                          ELSE
                              # encoding looks OK #
                              result +:= curr char;
                              next char;
                              result +:= curr char;
                              next char;
                              result +:= curr char;
                              next char
                          FI
                      ELSE
                          # single character element                                        #
                          result +:= curr char;
                          next char
                      FI
                  OD;
                  result
              END # possibly encoded seq # ;

         # returns and skips over the sequence of the specified characters starting         #
         #         at the current position, if there is one                                 #
         PROC seq = ( STRING characters )STRING:
              BEGIN
                  STRING result := "";
                  WHILE have one of( characters )
                  DO
                      result +:= curr char;
                      next char
                  OD;
                  result
              END # seq # ;

         # returns and skips over the sequence of the specified characters starting         #
         #         at the current position                                                  #
         # if the sequence is empty, the specified error message is issued                  #
         PROC seq 1 = ( STRING characters, error message )STRING:
              BEGIN
                  STRING result := seq( characters );
                  IF result = ""
                  THEN
                      # empty sequence                                                      #
                      error( "Expected at least one of: """ + characters + """ for " + error message )
                  FI;
                  result
              END # seq 1 # ;

         # checks the current character is ch and advances over it if it is                 #
         # if the current character is not ch, an error is indicated                        #
         PROC must be = ( CHAR ch, STRING message )VOID: IF have( ch ) THEN next char ELSE error( message ) FI;

         # checks we have reached the end of the text and sets an error if we haven't       #
         PROC must be at end = ( STRING message )VOID: IF NOT at end THEN error( message ) FI;

         # returns and skips over an IPV6 address - the address format is not validated     #
         PROC ipv6 address = STRING: seq( hex digits + ":" ) + seq( digits + "." );

         # ------------                                                                     #
         # main parsing                                                                     #
         # 
### ======
                                                                     #

         URI result := ( "", "", "", "", "", "", "", TRUE, "" );

         # initialise parsing #
         pos     := LWB text;
         end pos := UPB text;

         # get the scheme                                                                   #
         IF ok OF result
         THEN
             scheme OF result := seq 1( alphas + digits + "+-.", "URI scheme" );
             IF ok OF result
             THEN
                 # the scheme must start with a letter                                      #
                 IF NOT is letter( ( scheme OF result )[ 1 ] )
                 THEN
                     # scheme didn't start with a-z, A-Z                                    #
                     error( "URI scheme must start with a letter (a-z, A-Z only)" )
                 ELSE
                     # ok so far, there should be a ":" next                                #
                     must be( ":", "after the URI scheme" )
                 FI
             FI
         FI;

         # get the path #
         IF ok OF result
         THEN
             # got the scheme OK, get the path #
             IF curr char = "/" AND peek( 1 ) = "/"
             THEN
                 # URI has an authority                                                     #
                 # there will optionally be userinfo followed by @                          #
                 # if there is no "@", the element will be the host                         #
                 next char;
                 next char;
                 # remember the start positioin of the element, incase we need to backtrack #
                 INT start pos := pos;
                 userinfo OF result := possibly encoded seq( unreserved + sub delims + ":" );
                 IF ok OF result
                 THEN
                     # got an element OK #
                     IF have( "@" )
                     THEN
                         # there was an "@", so the element we just parsed was the user info #
                         next char
                     ELSE
                         # didn't get any user info, backtrack to parse the text as the host #
                         userinfo OF result := "";
                         pos := start pos
                     FI
                 FI;
                 # we should now have the host optionally followed by ":" and the port       #
                 IF have( "[" )
                 THEN
                     # host is an IP literal #
                     next char;
                     host OF result := ipv6 address;
                     must be( "]", "following IPV6 address in URI host" )
                 ELSE
                     # host is a reg-name or IPV4 address #
                     # note an IPV4 address matches the reg-name pattern and we do not       #
                     # distinguish between them                                              #
                     host OF result := possibly encoded seq( unreserved + sub delims )
                 FI;
                 # can now have a port - ":" followed by digits                              #
                 IF have( ":" )
                 THEN
                     # have a port #
                     next char;
                     port OF result := seq( digits );
                     # the port can only be followed by "/", "?" or a hash character         #
                     # as the authority must be followed by a path-abempty                   #
                     # and that is followed by optional query and optional fragment id       #
                     IF NOT have one of( "/?#" ) AND NOT at end
                     THEN
                         # the port is invalid or followed by extraneous characters          #
                         error( "Invalid URI port" )
                     FI
                 FI
             FI;
             # get the path                                                                  #
             # we expect a possibly empty sequence of segments separated by "/"              #
             # a segment is a possibly empty sequence of                                     #
             #     unreserved, sub-delims, %xx characters, ":" or "@"                        #
             # the RFC categorises paths as:                                                 #
             #     path-abempty    - begins with "/" or is empty                             #
             #     path-absolute   - begins with "/" but not "//"                            #
             #     path-noscheme   - no leading  "/" and no ":" in the first segment         #
             #     path-rootless   - no leading  "/" can have ":" in the first segment       #
             #     path-empty      - empty path                                              #
             # we do not attempt to distinguish between them                                 #
             WHILE path OF result +:= possibly encoded seq( unreserved + sub delims + ":@" );
                   have( "/" ) AND ok OF result
             DO
                 path OF result +:= "/";
                 next char
             OD
         FI;

         # get the query                                                                     #
         IF have( "?" ) AND ok OF result
         THEN
             # have a query                                                                  #
             next char;
             query OF result := possibly encoded seq( unreserved + sub delims + ":@/?" )
         FI;

         # get the fragment id, if there is one                                              #
         IF have( "#" ) AND ok OF result
         THEN
             # have a fragment id                                                            #
             next char;
             fragment id OF result := possibly encoded seq( unreserved + sub delims + "/?" )
         FI;

         # should have reached the end of the text                                           #
         IF ok OF result
         THEN
             # haven't reached the end of the text                                           #
             must be at end( "unexpected text at the end of the URI: " + text[ pos : ] )
         FI;

         result
     END # parse uri # ;