opentracker/scan_urlencoded_query.c

/* This software was written by Dirk Engling <erdgeist@erdgeist.org>
   It is considered beerware. Prost. Skol. Cheers or whatever.

   $id$ */

/* Opentracker */
#include "scan_urlencoded_query.h"

/* Libwofat */
#include "scan.h"

/* System */
#include <limits.h>
#include <string.h>

/* Idea is to do an in-place replacement or guarantee at least
   strlen( string ) bytes in deststring
   watch http://www.ietf.org/rfc/rfc2396.txt
         unreserved    = alphanum | mark
         mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
   we add '%' to the matrix to not stop at encoded chars.
   After losing too many requests to being too strict, the following characters are accepted by the matrix as well
         relax         = "+" | "," | "/" | ";" | "<" | ">" | ":"
*/

/* This matrix holds for each ascii character the information,
   whether it is a non-terminating character for one of the three
   scan states we are in, that is 'path', 'param' and 'value' from
  /path?param=value&param=value, it is encoded in bit 0, 1 and 2
  respectively

  The top bit of lower nibble indicates, whether this character is
  a hard terminator, ie. \0, \r, \n or space, where the whole scanning
  process should terminate
  */
/* Character class table indexed by byte value; each row below covers 32
   consecutive character codes. Entries are bit masks as described in the
   comment preceding this table (bits 0..2: character may continue a
   'path' / 'param' / 'value' token, bit 3: hard terminator). */
static const unsigned char is_unreserved[256] = {
  /* 0x00-0x1f: NUL, \n and \r are 8, every other control character is 0 */
  8,0,0,0,0,0,0,0,0,0,8,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x20-0x3f: space, '"', '#', '$' are 8; '&' is 0; '=' is 4 (value only);
     '?' is 6 (param and value); everything else, including '%' and the
     digits, is 7 */
  8,7,8,8,8,7,0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,4,7,6,
  /* 0x40-0x5f: '@' is 4 (value only); 'A'-'Z' and '_' are 7;
     '[', backslash, ']' and '^' are 8 */
  4,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,7,
  /* 0x60-0x7f: '`', '{', '|' and '}' are 8; 'a'-'z' and '~' are 7; DEL is 0 */
  8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,7,0,
  /* 0x80-0xff: all bytes with the high bit set are 0, i.e. they end any token */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};

/* Convert one hexadecimal digit character into its 4 bit value.
   Accepts '0'-'9', 'A'-'F' and 'a'-'f'; any other byte yields 0xff,
   which callers use as the "not a hex digit" marker. */
static unsigned char fromhex(unsigned char x) {
  if( x >= '0' && x <= '9' )
    return (unsigned char)( x - '0' );

  /* Fold upper case onto lower case so one range check covers both */
  x |= 0x20;
  if( x >= 'a' && x <= 'f' )
    return (unsigned char)( x - 'a' + 10 );

  return 0xff;
}

/* Skip the value of a param=value pair without decoding it.
   On return *string points just behind the character that ended the
   value, unless that character carries the hard terminator flag, in
   which case *string points at the terminator itself. */
void scan_urlencoded_skipvalue( char **string ) {
  const unsigned char *cursor = (const unsigned char *)*string;
  unsigned char char_class;

  /* Consume characters as long as they are allowed inside a 'value';
     the first character that is not allowed is consumed as well */
  do {
    char_class = is_unreserved[ *cursor++ ];
  } while( char_class & SCAN_SEARCHPATH_VALUE );

  /* A hard terminator must stay visible to the next
     scan_urlencoded_query call, so step back onto it */
  if( char_class & SCAN_SEARCHPATH_TERMINATOR )
    --cursor;

  *string = (char *)cursor;
}

/* Decode the next url-encoded token at *string (in place) and look it up
   in a keyword table.

   keywords  table of key/value entries, ended by an entry whose key is NULL
   string    in/out scan position, advanced by scan_urlencoded_query
   flags     scan context handed through to scan_urlencoded_query

   Returns the value of the matching table entry, the negative result of
   scan_urlencoded_query if decoding failed, or -3 if the token is empty
   or not contained in the table. */
int scan_find_keywords( const ot_keywords * keywords, char **string, SCAN_SEARCHPATH_FLAG flags) {
  char *deststring = *string;
  ssize_t match_length = scan_urlencoded_query( string, deststring, flags );
  size_t token_length;

  if( match_length < 0 ) return (int)match_length;
  if( match_length == 0 ) return -3;
  token_length = (size_t)match_length;

  for( ; keywords->key; ++keywords ) {
    /* Compare lengths first: the decoded token may contain NUL bytes
       (from %00), so a string compare could stop early and a following
       look at key[token_length] would read behind the key's terminator.
       With equal lengths memcmp never reads outside the key. */
    if( strlen( keywords->key ) != token_length ) continue;
    if( !memcmp( keywords->key, deststring, token_length ) )
      return keywords->value;
  }

  return -3;
}

/* Decode one url-encoded token starting at *string into deststring.

   string      in/out scan position; only updated when the call succeeds
   deststring  receives the decoded bytes, may be the same buffer as
               *string; no terminating NUL is written
   flags       scan context, selects which characters continue the token

   Returns the number of decoded bytes, -1 on a broken %XX escape or when
   the token ends on a character that is not expected in this context,
   -2 when scanning starts directly on \0, \r, \n or space.

   After success *string points behind a consumed '?' or '=' (and behind
   '&' unless scanning a param), or at the '&' / \0 / \r / \n / space
   that ended the token, so the next call sees that character again. */
ssize_t scan_urlencoded_query(char **string, char *deststring, SCAN_SEARCHPATH_FLAG flags) {
  const unsigned char *in = (const unsigned char *)*string;
  unsigned char *const out_begin = (unsigned char *)deststring;
  unsigned char *out = out_begin;
  unsigned char cur;

  /* Copy characters that are non-terminating in the current context,
     decoding %XX escapes on the way */
  for(;;) {
    cur = *in++;
    if( !( is_unreserved[ cur ] & flags ) )
      break;

    if( cur == '%' ) {
      unsigned char hi, lo;

      /* Both digits are read before anything is written, the output
         may share its buffer with the input */
      hi = fromhex( *in++ );
      if( hi == 0xff ) return -1;
      lo = fromhex( *in++ );
      if( lo == 0xff ) return -1;
      cur = (unsigned char)( ( hi << 4 ) | lo );
    }

    *out++ = cur;
  }

  /* cur now holds the raw character that ended the token */
  if( cur == 0 || cur == '\r' || cur == '\n' || cur == ' ' ) {
    /* Nothing decoded and already at the end: report that we are done */
    if( out == out_begin ) return -2;

    /* Otherwise leave the terminator in place for the next call */
    --in;
  } else if( cur == '?' ) {
    /* Only a path may end in '?' */
    if( flags != SCAN_PATH ) return -1;
  } else if( cur == '=' ) {
    /* Only a param may end in '=' */
    if( flags != SCAN_SEARCHPATH_PARAM ) return -1;
  } else if( cur == '&' ) {
    if( flags == SCAN_PATH ) return -1;
    /* A param ended by '&' has no value; leave the '&' unconsumed */
    if( flags == SCAN_SEARCHPATH_PARAM ) --in;
  } else {
    return -1;
  }

  *string = (char *)in;
  return out - out_begin;
}

/* Parse an optionally negative decimal number from a buffer of known size.

   data  start of the buffer, does not need to be NUL terminated
   len   number of bytes available at data
   tmp   receives the parsed value; 0 if no digit was found

   Returns the number of bytes that were NOT consumed, so 0 means the
   whole buffer was a number. Parsing stops at the first non-digit and
   also before a digit that would push the value beyond INT_MAX, so an
   over-long number shows up as unconsumed input instead of overflowing. */
ssize_t scan_fixed_int( char *data, size_t len, int *tmp ) {
  unsigned int magnitude = 0;
  int minus = 0;

  *tmp = 0;

  /* Never look at data[0] of an empty buffer; a '-' there would also
     make len wrap around */
  if( len == 0 ) return 0;

  if( *data == '-' ) { --len; ++data; minus = 1; }

  while( ( len > 0 ) && ( *data >= '0' ) && ( *data <= '9' ) ) {
    unsigned int digit = (unsigned int)( *data - '0' );

    /* Would 10 * magnitude + digit exceed INT_MAX? Then stop here. */
    if( magnitude > ( (unsigned int)INT_MAX - digit ) / 10 ) break;

    magnitude = 10 * magnitude + digit;
    ++data;
    --len;
  }

  /* magnitude <= INT_MAX, so both conversions are well defined */
  *tmp = minus ? -(int)magnitude : (int)magnitude;
  return len;
}

/* Version string of this source file. "$Source$" and "$Revision$" look
   like version control keyword placeholders - presumably expanded on
   checkout; verify against the build setup. */
const char *g_version_scan_urlencoded_query_c = "$Source$: $Revision$\n";
account downloaded before early returns 18 years ago			`/* This software was written by Dirk Engling <erdgeist@erdgeist.org>`
Introduce some kind of versioning 17 years ago			`It is considered beerware. Prost. Skol. Cheers or whatever.`
Whitespace fixes 16 years ago
Introduce some kind of versioning 17 years ago			`$id$ */`
README\! 18 years ago
Tidy up unnecessary includes 16 years ago			`/* Opentracker */`
Some syntax errors removed 18 years ago			`#include "scan_urlencoded_query.h"`
Our scanner routine for the URI query string 18 years ago
Tidy up unnecessary includes 16 years ago			`/* Libwofat */`
			`#include "scan.h"`

* http and udp routines now use thread local buffers passed in workstruct containers. In other words they do not use static_buffer anymore and are considered to be thread safe. * the new workstruct also introduces a well defined buffer and result passing path * a new function scan_find_keywords is a wrapper around scan_urlencoded_query that maps keys in url to values passed in an array of ot_keywords structs * this new function cleans up much of url parameter parsing work, where read_ptr and write_ptr have been introduced rather than the confusing char c, data variables * I now use memcmp instead of byte_diff to allow compiler to optimize constant size string compares * got rid of UTORRENT_1600_WORKAROUND * livesync_ticker is now only called from one (currently main) thread to avoid race conditions 16 years ago			`/* System */`
			`#include <string.h>`

Documentation improved, some reindenting (again), variable types checked, unnecessary defines removed 18 years ago			`/* Idea is to do a in place replacement or guarantee at least`
			`strlen( string ) bytes in deststring`
			`watch http://www.ietf.org/rfc/rfc2396.txt`
			`unreserved = alphanum \| mark`
			`mark = "-" \| "_" \| "." \| "!" \| "~" \| "*" \| "'" \| "(" \| ")"`
			`we add '%' to the matrix to not stop at encoded chars.`
Allowing more relaxed parsing of queries 18 years ago			`After losing too many requests to being too strict, add the following characters to reserved matrix`
Some clients even send plain : in their requests 18 years ago			`relax = "+" \| "," \| "/" \| ";" \| "<" \| ">" \| ":"`
Documentation improved, some reindenting (again), variable types checked, unnecessary defines removed 18 years ago			`*/`
Allowing more relaxed parsing of queries 18 years ago
Add documentation to our uri scanner 17 years ago			`/* This matrix holds for each ascii character the information,`
			`whether it is a non-terminating character for on of the three`
			`scan states we are in, that is 'path', 'param' and 'value' from`
			`/path?param=value&param=value, it is encoded in bit 0, 1 and 2`
			`respectively`

			`The top bit of lower nibble indicates, whether this character is`
			`a hard terminator, ie. \0, \n or \s, where the whole scanning`
			`process should terminate`
			`*/`
fixed one performance bug, where "skipping values from a &param=values pair" was requested, the requestor ended up with "values" to be parsed again. improved performance of fromhex improved performance of is_unreserved() by moving it all into a simple byte array improved performance of %41 => 'A' conversion by reordering variables 17 years ago			`static const unsigned char is_unreserved[256] = {`
Save a lot of work when skipping through uninteresting http request parameters 17 years ago			`8,0,0,0,0,0,0,0,0,0,8,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
fix parser 16 years ago			`8,7,8,8,8,7,0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,4,7,6,`
Be really relaxed now about what to accept in values... 17 years ago			`4,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,7,`
			`8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,7,0,`
fixed one performance bug, where "skipping values from a &param=values pair" was requested, the requestor ended up with "values" to be parsed again. improved performance of fromhex improved performance of is_unreserved() by moving it all into a simple byte array improved performance of %41 => 'A' conversion by reordering variables 17 years ago			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0`
			`};`

Add documentation to our uri scanner 17 years ago			`/* Do a fast nibble to hex representation conversion */`
Make fromhex() even faster 17 years ago			`static unsigned char fromhex(unsigned char x) {`
			`x-='0'; if( x<=9) return x;`
			`x&=~0x20; x-='A'-'0';`
			`if( x<6 ) return x+10;`
fixed one performance bug, where "skipping values from a &param=values pair" was requested, the requestor ended up with "values" to be parsed again. improved performance of fromhex improved performance of is_unreserved() by moving it all into a simple byte array improved performance of %41 => 'A' conversion by reordering variables 17 years ago			`return 0xff;`
Our scanner routine for the URI query string 18 years ago			`}`

Add documentation to our uri scanner 17 years ago			`/* Skip the value of a param=value pair */`
Save a lot of work when skipping through uninteresting http request parameters 17 years ago			`void scan_urlencoded_skipvalue( char **string ) {`
			`const unsigned char* s=(const unsigned char*) string;`
			`unsigned char f;`

Add documentation to our uri scanner 17 years ago			`/* Since we are asked to skip the 'value', we assume to stop at`
			`terminators for a 'value' string position */`
Save a lot of work when skipping through uninteresting http request parameters 17 years ago			`while( ( f = is_unreserved[ *s++ ] ) & SCAN_SEARCHPATH_VALUE );`
Add documentation to our uri scanner 17 years ago
			`/* If we stopped at a hard terminator like \0 or \n, make the`
			`next scan_urlencoded_query encounter it again */`
Save a lot of work when skipping through uninteresting http request parameters 17 years ago			`if( f & SCAN_SEARCHPATH_TERMINATOR ) --s;`
Add documentation to our uri scanner 17 years ago
Save a lot of work when skipping through uninteresting http request parameters 17 years ago			`string = (char)s;`
			`}`

* http and udp routines now use thread local buffers passed in workstruct containers. In other words they do not use static_buffer anymore and are considered to be thread safe. * the new workstruct also introduces a well defined buffer and result passing path * a new function scan_find_keywords is a wrapper around scan_urlencoded_query that maps keys in url to values passed in an array of ot_keywords structs * this new function cleans up much of url parameter parsing work, where read_ptr and write_ptr have been introduced rather than the confusing char c, data variables * I now use memcmp instead of byte_diff to allow compiler to optimize constant size string compares * got rid of UTORRENT_1600_WORKAROUND * livesync_ticker is now only called from one (currently main) thread to avoid race conditions 16 years ago			`int scan_find_keywords( const ot_keywords * keywords, char **string, SCAN_SEARCHPATH_FLAG flags) {`
			`char deststring = string;`
			`ssize_t match_length = scan_urlencoded_query(string, deststring, flags );`

			`if( match_length < 0 ) return match_length;`
			`if( match_length == 0 ) return -3;`

			`while( keywords->key ) {`
Lot of Gehacktes around this keyword scanner. Don't return a match if string to compare is shorter than key from table. 16 years ago			`if( !strncmp( keywords->key, deststring, match_length ) && !keywords->key[match_length] )`
* http and udp routines now use thread local buffers passed in workstruct containers. In other words they do not use static_buffer anymore and are considered to be thread safe. * the new workstruct also introduces a well defined buffer and result passing path * a new function scan_find_keywords is a wrapper around scan_urlencoded_query that maps keys in url to values passed in an array of ot_keywords structs * this new function cleans up much of url parameter parsing work, where read_ptr and write_ptr have been introduced rather than the confusing char c, data variables * I now use memcmp instead of byte_diff to allow compiler to optimize constant size string compares * got rid of UTORRENT_1600_WORKAROUND * livesync_ticker is now only called from one (currently main) thread to avoid race conditions 16 years ago			`return keywords->value;`
			`keywords++;`
			`}`

			`return -3;`
			`}`

Save a lot of work when skipping through uninteresting http request parameters 17 years ago			`ssize_t scan_urlencoded_query(char *string, char deststring, SCAN_SEARCHPATH_FLAG flags) {`
fixed one performance bug, where "skipping values from a &param=values pair" was requested, the requestor ended up with "values" to be parsed again. improved performance of fromhex improved performance of is_unreserved() by moving it all into a simple byte array improved performance of %41 => 'A' conversion by reordering variables 17 years ago			`const unsigned char* s=(const unsigned char*) string;`
Some syntax errors removed 18 years ago			`unsigned char d = (unsigned char)deststring;`
added live sync code added a config file parser added tracker id changed WANT_CLOSED_TRACKER and WANT_BLACKLIST into WANT_ACCESS_WHITE and WANT_ACCESS_BLACK changed WANT_TRACKER_SYNC to WANT_SYNC_BATCH and added WANT_SYNC_LIVE added an option to switch off fullscrapes cleaned up many internal hardcoded values, like PROTO_FLAG, 16 years ago			`unsigned char b, c;`
Our scanner routine for the URI query string 18 years ago
Add documentation to our uri scanner 17 years ago			`/* This is the main decoding loop.`
			`'flag' determines, which characters are non-terminating in current context`
			`(ie. stop at '=' and '&' if scanning for a 'param'; stop at '?' if scanning for the path )`
			`*/`
added live sync code added a config file parser added tracker id changed WANT_CLOSED_TRACKER and WANT_BLACKLIST into WANT_ACCESS_WHITE and WANT_ACCESS_BLACK changed WANT_TRACKER_SYNC to WANT_SYNC_BATCH and added WANT_SYNC_LIVE added an option to switch off fullscrapes cleaned up many internal hardcoded values, like PROTO_FLAG, 16 years ago			`while( is_unreserved[ c = *s++ ] & flags ) {`
Add documentation to our uri scanner 17 years ago
			`/* When encountering an url escaped character, try to decode */`
Added option to get ip from query string + parser, fixed two bugs concerning grow/shrink of vectors. Now cleans up a torrent BEFORE trying to remove a peer -> this may remove peer already and must be done anyway. 18 years ago			`if( c=='%') {`
fixed one performance bug, where "skipping values from a &param=values pair" was requested, the requestor ended up with "values" to be parsed again. improved performance of fromhex improved performance of is_unreserved() by moving it all into a simple byte array improved performance of %41 => 'A' conversion by reordering variables 17 years ago			`if( ( b = fromhex(*s++) ) == 0xff ) return -1;`
			`if( ( c = fromhex(*s++) ) == 0xff ) return -1;`
			`c\|=(b<<4);`
Our scanner routine for the URI query string 18 years ago			`}`
Add documentation to our uri scanner 17 years ago
			`/* Write (possibly decoded) character to output */`
Save a lot of work when skipping through uninteresting http request parameters 17 years ago			`*d++ = c;`
Our scanner routine for the URI query string 18 years ago			`}`

			`switch( c ) {`
			`case 0: case '\r': case '\n': case ' ':`
Add documentation to our uri scanner 17 years ago			`/* If we started scanning on a hard terminator, indicate we've finished */`
Save a lot of work when skipping through uninteresting http request parameters 17 years ago			`if( d == (unsigned char*)deststring ) return -2;`
Add documentation to our uri scanner 17 years ago
			`/* Else make the next call to scan_urlencoded_param encounter it again */`
Fixed parser 18 years ago			`--s;`
Our scanner routine for the URI query string 18 years ago			`break;`
			`case '?':`
* http and udp routines now use thread local buffers passed in workstruct containers. In other words they do not use static_buffer anymore and are considered to be thread safe. * the new workstruct also introduces a well defined buffer and result passing path * a new function scan_find_keywords is a wrapper around scan_urlencoded_query that maps keys in url to values passed in an array of ot_keywords structs * this new function cleans up much of url parameter parsing work, where read_ptr and write_ptr have been introduced rather than the confusing char c, data variables * I now use memcmp instead of byte_diff to allow compiler to optimize constant size string compares * got rid of UTORRENT_1600_WORKAROUND * livesync_ticker is now only called from one (currently main) thread to avoid race conditions 16 years ago			`if( flags != SCAN_PATH ) return -1;`
Our scanner routine for the URI query string 18 years ago			`break;`
			`case '=':`
Fixed parser 18 years ago			`if( flags != SCAN_SEARCHPATH_PARAM ) return -1;`
Our scanner routine for the URI query string 18 years ago			`break;`
			`case '&':`
Fixed parser 18 years ago			`if( flags == SCAN_PATH ) return -1;`
			`if( flags == SCAN_SEARCHPATH_PARAM ) --s;`
Our scanner routine for the URI query string 18 years ago			`break;`
			`default:`
			`return -1;`
			`}`

Some syntax errors removed 18 years ago			`string = (char )s;`
			`return d - (unsigned char*)deststring;`
Our scanner routine for the URI query string 18 years ago			`}`
Now actually seems to work for the most parts Added scraping Added graceful disconnect 18 years ago
Use signed size_t wherever appropriate 18 years ago			`ssize_t scan_fixed_int( char data, size_t len, int tmp ) {`
some clients chose to accidently send negative numwants 17 years ago			`int minus = 0;`
Now actually seems to work for the most parts Added scraping Added graceful disconnect 18 years ago			`*tmp = 0;`
some clients chose to accidently send negative numwants 17 years ago			`if( *data == '-' ) --len, ++data, ++minus;`
Now actually seems to work for the most parts Added scraping Added graceful disconnect 18 years ago			`while( (len > 0) && (data >= '0') && (data <= '9') ) { --len; tmp = 10tmp + data++-'0'; }`
some clients chose to accidently send negative numwants 17 years ago			`if( minus ) tmp = -tmp;`
Now actually seems to work for the most parts Added scraping Added graceful disconnect 18 years ago			`return len;`
			`}`
Added option to get ip from query string + parser, fixed two bugs concerning grow/shrink of vectors. Now cleans up a torrent BEFORE trying to remove a peer -> this may remove peer already and must be done anyway. 18 years ago
Introduce some kind of versioning 17 years ago			`const char *g_version_scan_urlencoded_query_c = "$Source$: $Revision$\n";`