diff --git a/ChangeLog b/ChangeLog index a73a38572..561e16f7e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +Wed Jul 23 16:32:32 EEST 2008 (edwin) +------------------------------------ + * libclamav: performance improvements for URL matching (bb #725, bb #650): + * use a suffix AC-trie and a shift-or FSM to filter + * rewrite the URL regex in C + * use a perfect hash to lookup TLD and ccTLD, instead of a regex + * TODO: suffixes having a common prefix: loop over all of them + cli_ac_free: multiple virname pointing to same location + Mon Jul 21 12:16:44 CEST 2008 (tk) ---------------------------------- * sigtool/vba.c: fix crash on error in vba code (bb#1106) diff --git a/contrib/entitynorm/Makefile b/contrib/entitynorm/Makefile index 7ff2f0821..f4e619f00 100644 --- a/contrib/entitynorm/Makefile +++ b/contrib/entitynorm/Makefile @@ -1,7 +1,7 @@ PERL=perl CC=cc -all: entitylist.h encoding_aliases.h gentbl encname_chars.h +all: entitylist.h encoding_aliases.h gentbl encname_chars.h generate_hash entities_parsed: entities entities/* entity_decl_parse.pl $(PERL) entity_decl_parse.pl $$@ @@ -9,6 +9,9 @@ entities_parsed: entities entities/* entity_decl_parse.pl generate_entitylist: generate_entitylist.c ../../libclamav/hashtab.h ../../libclamav/hashtab.c ../../libclamav/others.c $(CC) -I. -DHAVE_CONFIG_H -DCLI_MEMFUNSONLY -DPROFILE_HASHTABLE $< ../../libclamav/hashtab.c ../../libclamav/others.c -o $@ +generate_hash: generate_hash.c ../../libclamav/hashtab.h ../../libclamav/hashtab.c ../../libclamav/others.c + $(CC) -I. -DHAVE_CONFIG_H -DCLI_MEMFUNSONLY -DPROFILE_HASHTABLE $< ../../libclamav/hashtab.c ../../libclamav/others.c -o $@ + generate_encoding_aliases: generate_encoding_aliases.c ../../libclamav/hashtab.c ../../libclamav/others.c ../../libclamav/htmlnorm.h ../../libclamav/entconv.h ../../libclamav/cltypes.h ../../libclamav/hashtab.h ../../libclamav/hashtab.h $(CC) -I. 
-DHAVE_CONFIG_H -DCLI_MEMFUNSONLY -DPROFILE_HASHTABLE $< ../../libclamav/hashtab.c ../../libclamav/others.c -o $@ diff --git a/contrib/phishing/update_iana_data.sh b/contrib/phishing/update_iana_data.sh index fa412e90a..7ceec5253 100755 --- a/contrib/phishing/update_iana_data.sh +++ b/contrib/phishing/update_iana_data.sh @@ -26,30 +26,11 @@ OUTFILE=iana_tld.h echo "Downloading updated tld list from iana.org" wget $IANA_TLD -O $TMP || exit 2 echo "Download complete, parsing data" -# 174 is the code for | -TLDLIST=$(egrep -v ^# $TMP | tr \\n \\174 | sed 's/[^a-zA-Z]$//') -echo "Parse complete, removing tmpfile" -rm $TMP -echo "Generating tld list in $OUTFILE" -cat >$OUTFILE <>$OUTFILE -echo -n $TLDLIST >>$OUTFILE -echo ")\"" >>$OUTFILE +grep -Ev ^# $TMP | tr 'A-Z' 'a-z' | gperf -C -l -L ANSI-C -E -C -H tld_hash -N in_tld_set|grep -v '^#line' | sed -e 's/^const struct/static const struct/' -e 's/register //g' >iana_tld.h echo "Downloading updated country-code list from iana.org" wget $IANA_CCTLD -O $TMP || exit 2 echo "Download complete, parsing data" -CCTLDLIST=$(cat $TMP | egrep -oi "]+>\\.([a-zA-Z]+).+" | egrep -o ">.[a-zA-Z]+" | colrm 1 2 | tr \\n \\174 | sed 's/[^a-zA-Z]$//') -echo "Parse complete, removing tmpfile" -rm $TMP -echo "Generating cctld list in $OUTFILE" -echo -n "#define iana_cctld \"(" >>$OUTFILE -echo -n $CCTLDLIST >>$OUTFILE -echo ")\"" >>$OUTFILE - - -echo "#endif" >>$OUTFILE -echo "Finished succesfully" +cat $TMP | grep country-code|egrep -oi "]+>\\.([a-zA-Z]+).+"|egrep -o ">.[a-zA-Z]+" | colrm 1 2 | tr 'A-Z' 'a-z'| gperf -C -l -L ANSI-C -E -C -H cctld_hash -N in_cctld_set |grep -v '^#line'|sed -e 's/^const struct/static const struct/' -e 's/register //g' >iana_cctld.h +echo "Done" diff --git a/contrib/phishing/update_iana_tld.sh b/contrib/phishing/update_iana_tld.sh index 816f9f02c..2bf06aca8 100755 --- a/contrib/phishing/update_iana_tld.sh +++ b/contrib/phishing/update_iana_tld.sh @@ -26,17 +26,4 @@ echo "Downloading updated tld list from 
iana.org" wget $IANA_TLD -O $TMP || exit 2 echo "Download complete, parsing data" # 174 is the code for | -TLDLIST=$(egrep -v ^# $TMP|tr \\n \\174 ) -echo "Parse complete, removing tmpfile" -rm $TMP -echo "Generating $OUTFILE" -cat >$OUTFILE <>$OUTFILE -echo -n $TLDLIST >>$OUTFILE -echo ")\"" >>$OUTFILE -echo "#endif" >>$OUTFILE -echo "Finished succesfully" - +grep -Ev ^# $TMP | tr 'A-Z' 'a-z' | gperf -C -H tld_hash -N in_tld_set -l|grep -v '^#line' | sed -e 's/^const struct/static const struct/' -e 's/register //g' diff --git a/docs/clamdoc.tex b/docs/clamdoc.tex index a654606d7..fcae20bc5 100644 --- a/docs/clamdoc.tex +++ b/docs/clamdoc.tex @@ -361,7 +361,7 @@ All 4 tests passed \item The exact output from \verb+make check+ \item Output of \verb+uname -mrsp+ \item your \verb+config.log+ - \item The following files from the \verb+unit-tests/+ directory: + \item The following files from the \verb+unit_tests/+ directory: \begin{itemize} \item \verb+test.log+ \item \verb+clamscan.log+ diff --git a/libclamav/hashtab.c b/libclamav/hashtab.c index c47183f89..ef7626383 100644 --- a/libclamav/hashtab.c +++ b/libclamav/hashtab.c @@ -367,10 +367,18 @@ void hashtab_clear(struct hashtable *s) if(s->htable[i].key && s->htable[i].key != DELETED_KEY) free((void *)s->htable[i].key); } - memset(s->htable, 0, s->capacity); + if(s->htable) + memset(s->htable, 0, s->capacity * sizeof(*s->htable)); /* capacity is an element count, not a byte count: zero every slot so no freed key pointer survives */ s->used = 0; } +void hashtab_free(struct hashtable *s) +{ + hashtab_clear(s); + free(s->htable); + s->htable = NULL; + s->capacity = 0; +} int hashtab_store(const struct hashtable *s,FILE* out) { diff --git a/libclamav/hashtab.h b/libclamav/hashtab.h index 2d3faa37f..6410a67b3 100644 --- a/libclamav/hashtab.h +++ b/libclamav/hashtab.h @@ -82,7 +82,7 @@ int hashtab_init(struct hashtable *s,size_t capacity); const struct element* hashtab_insert(struct hashtable *s, const char* key, const size_t len, const element_data data); void hashtab_delete(struct hashtable *s,const char* key,const size_t len); void 
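hashtab_clear(struct hashtable *s); - +void hashtab_free(struct hashtable *s); int hashtab_load(FILE* in, struct hashtable *s); int hashtab_store(const struct hashtable *s,FILE* out); diff --git a/libclamav/iana_cctld.h b/libclamav/iana_cctld.h new file mode 100644 index 000000000..6bceb9422 --- /dev/null +++ b/libclamav/iana_cctld.h @@ -0,0 +1,505 @@ +/* ANSI-C code produced by gperf version 3.0.3 */ +/* Command-line: gperf -C -l -L ANSI-C -E -C -H cctld_hash -N in_cctld_set */ +/* Computed positions: -k'1-2' */ + +#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ + && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \ + && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \ + && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \ + && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \ + && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \ + && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \ + && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \ + && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \ + && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \ + && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \ + && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \ + && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \ + && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \ + && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \ + && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \ + && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \ + && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \ + && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \ + && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \ + && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \ + && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \ + && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) +/* The character set is not based on ISO-646. */ +#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gnu-gperf@gnu.org>." 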
+#endif + +/* maximum key range = 472, duplicates = 0 */ + +#ifdef __GNUC__ +__inline +#else +#ifdef __cplusplus +inline +#endif +#endif +static unsigned int +cctld_hash (const char *str, unsigned int len) +{ + static const unsigned short asso_values[] = + {}; + return len + asso_values[(unsigned char)str[1]] + asso_values[(unsigned char)str[0]+25]; +} + +#ifdef __GNUC__ +__inline +#ifdef __GNUC_STDC_INLINE__ +__attribute__ ((__gnu_inline__)) +#endif +#endif +const char * +in_cctld_set (const char *str, unsigned int len) +{ + enum + { + TOTAL_KEYWORDS = 252, + MIN_WORD_LENGTH = 2, + MAX_WORD_LENGTH = 2, + MIN_HASH_VALUE = 4, + MAX_HASH_VALUE = 475 + }; + + static const unsigned char lengthtable[] = + { + 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, + 2, 2, 2, 0, 0, 2, 2, 0, 0, 0, 2, 2, 0, 2, + 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, + 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, + 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, + 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, + 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, + 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, + 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, + 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, + 0, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2, + 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 0, 2, + 2, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 0, 2, + 2, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 0, + 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 0, + 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, + 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, + 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, + 2, 2, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 0, 2, + 2, 0, 2, 0, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, + 0, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2, + 0, 0, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, + 0, 0, 0, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, + 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, + 2, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0, + 0, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2, + 2, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, + 2, 0, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, + 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, + 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 + }; + static const char * const wordlist[] = + { + "", "", "", "", + "sv", + "sy", + "se", + "", "", + "mv", + "my", + "me", + "", "", + "bv", + "by", + "be", + "", "", + "cv", + "cy", + "", "", "", + "tv", + "ms", + "", + "sz", + "", + "re", + "bs", + "ae", + "mz", + "", + "ws", + "sc", + "st", + "bz", + "", + "ye", + "mc", + "mt", + "cz", + "rs", + "mq", + "as", + "bt", + "tz", + "", "", + "cc", + "", + "az", + "", "", + "tc", + "tt", + "sm", + "lv", + "ly", + "ac", + "at", + "mm", + "", + "aq", + "", + "mf", + "bm", + "", + "yt", + "", + "bf", + "cm", + "", + "ls", + "wf", + "cf", + "tm", + "", "", + "mw", + "tf", + "am", + "", + "je", + "bw", + "af", + "sr", + "", + "lc", + "lt", + "so", + "mr", + "", "", + "tw", + "mo", + "br", + "rw", + "sb", + "aw", + "bo", + "cr", + "", "", + "sd", + "co", + "tr", + "", + "bb", + "md", + "to", + "ar", + "", + "ro", + "bd", + "ao", + "sg", + "", + "mx", + "cd", + "sa", + "mg", + "de", + "", + "td", + "ma", + "bg", + "", + "cx", + "ad", + "ba", + "cg", + "", "", + "jm", + "ca", + "tg", + "", + "ax", + "", + "lr", + "ag", + "", + "dz", + "sk", + "qa", + "sn", + "", "", + "mk", + "si", + "mn", + "lb", + "", + "gy", + "ge", + "bn", + "", "", + "ck", + "bi", + "cn", + "", "", + "tk", + "ci", + "tn", + "", + "jo", + "gs", + "sj", + "an", + "", + "dm", + "la", + "ai", + "sl", + "", "", "", + "bj", + "ml", + "", "", + "mp", + "gt", + "bl", + "", + "gq", + "", + "tj", + "cl", + "", "", + "py", + "pe", + "tl", + "", + "lk", + "tp", + "", + "al", + "", "", + "li", + "ie", + "gm", + "do", + "", + "ps", + "gf", + "sh", + "", "", + "ee", + "", + "mh", + "", "", + "is", + "ne", + "bh", + "", "", + "gw", + "pt", + "ch", + "", + "es", + "ky", + "ke", + "th", + "", "", "", + "it", + "gr", + "uy", + "iq", + "ve", + "su", + "nz", + "", + 
"ec", + "et", + "mu", + "pm", + "", + "gb", + "nc", + "pf", + "kz", + "us", + "", + "gd", + "cu", + "im", + "jp", + "ht", + "uz", + "zm", + "dk", + "", + "ru", + "pw", + "au", + "gg", + "", + "vc", + "", + "ga", + "om", + "", + "yu", + "", + "nf", + "pr", + "", + "zw", + "hm", + "", + "km", + "", "", "", + "fm", + "ir", + "dj", + "", + "um", + "io", + "", "", "", + "lu", + "er", + "gn", + "", "", + "kw", + "gi", + "nr", + "", "", + "id", + "no", + "pg", + "", "", + "hr", + "pa", + "kr", + "", "", "", + "fr", + "", "", "", + "fo", + "", "", "", "", + "za", + "eg", + "gl", + "", "", + "gp", + "", + "ng", + "", "", + "pk", + "na", + "pn", + "", "", "", "", + "kg", + "", "", "", "", + "in", + "", "", + "ug", + "vg", + "", "", + "ua", + "va", + "", "", "", "", "", "", + "gh", + "", "", "", + "ni", + "pl", + "hk", + "", + "hn", + "", + "kn", + "", + "fk", + "", + "ki", + "il", + "uk", + "", + "fi", + "vn", + "", "", "", + "vi", + "", "", "", "", "", + "gu", + "nl", + "", "", + "np", + "", "", "", "", + "fj", + "", + "ph", + "", "", + "kp", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", + "eh", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", + "kh", + "", "", "", "", "", "", "", "", "", + "", "", "", + "eu", + "", "", "", "", "", + "nu", + "", "", "", "", "", "", "", + "hu", + "", "", "", "", "", "", "", "", "", + "", + "vu" + }; + + if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) + { + int key = cctld_hash (str, len); + + if (key <= MAX_HASH_VALUE && key >= 0) + if (len == lengthtable[key]) + { + const char *s = wordlist[key]; + + if (*str == *s && !memcmp (str + 1, s + 1, len - 1)) + return s; + } + } + return 0; +} diff --git a/libclamav/iana_tld.h b/libclamav/iana_tld.h index e3fd17b08..f2568f675 100644 --- a/libclamav/iana_tld.h +++ b/libclamav/iana_tld.h @@ -1,28 +1,746 @@ -/* - * Phishing module: iana tld list. - * - * Copyright (C) 2007-2008 Sourcefire, Inc. 
- * - * Authors: Török Edvin - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, - * MA 02110-1301, USA. - */ +/* ANSI-C code produced by gperf version 3.0.3 */ +/* Command-line: gperf -C -l -L ANSI-C -E -C -H tld_hash -N in_tld_set */ +/* Computed positions: -k'1-2,6' */ -#ifndef IANA_TLD_H -#define IANA_TLD_H -#define iana_tld "(A[CDEFGILMNOQRSTUWXZ]|B[ABDEFGHIJMNORSTVWYZ]|C[ACDFGHIKLMNORUVXYZ]|D[EJKMOZ]|E[CEGRSTU]|F[IJKMOR]|G[ABDEFGHILMNPQRSTUWY]|H[KMNRTU]|I[DELMNOQRST]|J[EMOP]|K[EGHIMNPRWYZ]|L[ABCIKRSTUVY]|M[ACDEGHKLMNOPQRSTUVWXYZ]|N[ACEFGILOPRUZ]|OM|P[AEFGHKLMNRSTWY]|QA|R[EOSUW]|S[ABCDEGHIJKLMNORTUVYZ]|T[CDFGHJKLMNOPRTVWZ]|U[AGKMSYZ]|V[ACEGINU]|W[FS]|Y[ETU]|Z[AMW]|BIZ|CAT|COM|EDU|GOV|INT|MIL|NET|ORG|PRO|TEL|AERO|ARPA|ASIA|COOP|INFO|JOBS|MOBI|NAME|MUSEUM|TRAVEL|XN--ZCKZAH|XN--0ZWM56D|XN--DEBA0AD|XN--G6W251D|XN--JXALPDLP|XN--KGBECHTV|XN--9T4B11YI5A|XN--80AKHBYKNJ4F|XN--11B5BS3A9AJ6G|XN--HGBK6AJ7F53BBA)" -#define iana_cctld "(A[CDEFGILMNOQRSTUWXZ]|B[ABDEFGHIJLMNORSTVWYZ]|C[ACDFGHIKLMNORUVXYZ]|D[EJKMOZ]|E[CEGHRSTU]|F[IJKMOR]|G[ABDEFGHILMNPQRSTUWY]|H[KMNRTU]|I[DELMNOQRST]|J[EMOP]|K[EGHIMNPRWYZ]|L[ABCIKRSTUVY]|M[ACDEFGHKLMNOPQRSTUVWXYZ]|N[ACEFGILOPRUZ]|OM|P[AEFGHKLMNRSTWY]|QA|R[EOSUW]|S[ABCDEGHIJKLMNORTUVYZ]|T[CDFGHJKLMNOPRTVWZ]|U[AGKMSYZ]|V[ACEGINU]|W[FS]|Y[ETU]|Z[AMW]|BIZ|CAT|COM|EDU|GOV|IN[TT]|MIL|NET|ORG|PRO|TEL|AERO|ARP[AA]|ASIA|COOP|INFO|JOBS|MOBI|NAME|MUSEUM)" +#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ + && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \ + && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \ + && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \ + && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \ + && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \ + && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \ + && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \ + && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \ + && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \ + && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \ + && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \ + && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \ + && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \ + && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \ + && ('^' == 94) && ('_' == 95) && ('a' == 97) 
&& ('b' == 98) \ + && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \ + && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \ + && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \ + && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \ + && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \ + && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \ + && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) +/* The character set is not based on ISO-646. */ +#error "gperf generated tables don't work with this execution character set. Please report a bug to ." #endif +/* maximum key range = 983, duplicates = 0 */ + +#ifdef __GNUC__ +__inline +#else +#ifdef __cplusplus +inline +#endif +#endif +static unsigned int +tld_hash (const char *str, unsigned int len) +{ + static const unsigned short asso_values[] = + {}; + int hval = len; + + switch (hval) + { + default: + hval += asso_values[(unsigned char)str[5]]; + /*FALLTHROUGH*/ + case 5: + case 4: + case 3: + case 2: + hval += asso_values[(unsigned char)str[1]]; + /*FALLTHROUGH*/ + case 1: + hval += asso_values[(unsigned char)str[0]+25]; + break; + } + return hval; +} + +#ifdef __GNUC__ +__inline +#ifdef __GNUC_STDC_INLINE__ +__attribute__ ((__gnu_inline__)) +#endif +#endif +const char * +in_tld_set (const char *str, unsigned int len) +{ + enum + { + TOTAL_KEYWORDS = 280, + MIN_WORD_LENGTH = 2, + MAX_WORD_LENGTH = 18, + MIN_HASH_VALUE = 5, + MAX_HASH_VALUE = 987 + }; + + static const unsigned char lengthtable[] = + { + 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, + 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, + 0, 0, 2, 0, 2, 0, 4, 2, 0, 2, 3, 4, 2, 0, + 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, + 0, 4, 0, 0, 2, 0, 2, 0, 4, 2, 0, 2, 3, 0, + 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, + 4, 2, 0, 2, 2, 0, 2, 0, 2, 0, 0, 2, 0, 2, + 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0, 0, 0, 0, + 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 3, 0, 2, + 0, 2, 0, 0, 2, 0, 2, 3, 0, 2, 0, 0, 
2, 0, + 2, 0, 2, 0, 0, 2, 0, 4, 2, 0, 2, 0, 2, 0, + 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, + 0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 3, 0, 2, 0, + 2, 0, 0, 2, 0, 2, 0, 4, 2, 0, 2, 0, 0, 2, + 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 0, + 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 3, + 0, 2, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 0, 0, + 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0, + 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, + 0, 2, 0, 0, 2, 6, 2, 0, 0, 0, 0, 2, 0, 0, + 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, + 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, + 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, + 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, + 0, 2, 0, 0, 2, 0, 2, 0, 6, 2, 0, 2, 0, 0, + 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, + 0, 2, 0, 2, 3, 0, 2, 0, 2, 0, 0, 2, 0, 2, + 0, 0, 0, 0, 2, 0, 0, 2, 11, 2, 0, 0, 0, 16, + 2, 0, 0, 0, 11, 2, 0, 0, 0, 0, 2, 0, 0, 0, + 0, 17, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0, 0, + 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 3, + 0, 2, 11, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, + 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, + 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 10, 0, 2, + 0, 2, 0, 0, 2, 0, 12, 0, 0, 2, 3, 2, 0, 0, + 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, + 0, 2, 0, 2, 18, 0, 2, 0, 2, 0, 0, 2, 0, 2, + 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 0, 0, + 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, + 0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, + 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, + 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, + 0, 0, 2, 0, 12, 0, 0, 0, 0, 2, 18, 0, 0, 0, + 2, 3, 4, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, + 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, + 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, + 0, 2, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, + 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, + 
0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2 + }; + static const char * const wordlist[] = + { + "", "", "", "", "", + "md", + "", "", + "mv", + "", + "cd", + "", + "mz", + "cv", + "", + "ad", + "", + "cz", + "", "", + "mu", + "", + "az", + "", "", + "cu", + "", + "nz", + "", "", + "au", + "", + "mo", + "", + "mobi", + "nu", + "", + "co", + "com", + "coop", + "fo", + "", + "ao", + "", "", + "ms", + "", + "no", + "", "", "", "", + "me", + "", "", + "as", + "", + "asia", + "", "", + "my", + "", + "ae", + "", + "aero", + "cy", + "", + "ne", + "net", + "", "", "", + "mr", + "", "", "", "", + "cr", + "", "", + "fr", + "", + "ar", + "", + "arpa", + "td", + "", + "nr", + "tv", + "", + "mc", + "", + "tz", + "", "", + "cc", + "", + "mx", + "", "", + "ac", + "", + "cx", + "lv", + "", + "nc", + "", + "ax", + "", "", "", "", + "to", + "", "", + "lu", + "", + "ml", + "", "", "", "", + "cl", + "org", + "", + "mh", + "", + "al", + "", "", + "ch", + "", + "nl", + "tel", + "", + "sd", + "", "", + "sv", + "", + "ls", + "", + "sz", + "", "", + "jo", + "", + "jobs", + "ru", + "", + "su", + "", + "tr", + "", "", + "ly", + "", "", "", "", + "ro", + "", + "so", + "", "", + "je", + "", + "lr", + "", "", + "tc", + "", + "ma", + "rs", + "", "", "", + "ca", + "cat", + "", + "re", + "", + "se", + "", "", + "lc", + "", + "na", + "", + "name", + "sy", + "", + "qa", + 
"", "", + "gd", + "", + "tl", + "", "", "", "", + "sr", + "", "", + "th", + "", + "mg", + "", "", + "gu", + "", + "cg", + "", "", "", "", + "ag", + "", "", + "sc", + "", + "ng", + "gov", + "", + "bd", + "", "", + "bv", + "", + "id", + "", + "bz", + "", "", + "gs", + "", "", "", "", + "mk", + "", + "ge", + "", "", + "ck", + "", + "sl", + "fk", + "", + "gy", + "", + "bo", + "", "", + "sh", + "", + "io", + "", "", "", "", + "gr", + "", "", + "bs", + "", + "la", + "", "", + "is", + "travel", + "be", + "", "", "", "", + "ie", + "", "", + "by", + "", "", "", "", + "mw", + "", + "tg", + "", "", "", "", + "br", + "", "", + "aw", + "", + "ir", + "", "", + "cf", + "", + "sa", + "", "", + "af", + "", + "gl", + "", "", + "nf", + "", "", "", "", + "gh", + "", "", "", "", + "tk", + "", + "mm", + "", "", + "yu", + "", + "cm", + "", "", + "fm", + "", + "am", + "", "", + "lk", + "", + "sg", + "", "", + "ps", + "", + "il", + "", + "museum", + "bh", + "", + "pe", + "", "", + "mq", + "", "", "", "", + "py", + "", + "ye", + "", "", + "aq", + "", + "ga", + "", "", + "tw", + "", + "pr", + "pro", + "", + "sk", + "", + "om", + "", "", + "tf", + "", + "mn", + "", "", "", "", + "cn", + "", "", + "ws", + "xn--g6w251d", + "an", + "", "", "", + "xn--80akhbyknj4f", + "ba", + "", "", "", + "xn--0zwm56d", + "gg", + "", "", "", "", + "tm", + "", "", "", "", + "xn--11b5bs3a9aj6g", + "", "", + "hu", + "", + "pl", + "rw", + "", + "mp", + "", + "uz", + "", "", + "ph", + "", "", "", "", + "lb", + "", + "bg", + "", "", + "np", + "", + "kz", + "mil", + "", + "jm", + "xn--deba0ad", + "ci", + "", "", + "fi", + "", + "ai", + "", "", "", "", + "ni", + "", "", + "us", + "", + "sm", + "", "", "", "", + "tn", + "", "", + "sb", + "", + "hr", + "", "", + "uy", + "", + "pa", + "", "", "", "", + "ke", + "xn--zckzah", + "", + "gw", + "", + "mt", + "", "", + "ky", + "", + "xn--jxalpdlp", + "", "", + "gf", + "edu", + "at", + "", "", + "vu", + "", + "kr", + "", "", + "tp", + "", + "dz", + "", "", + "eu", + "", + "pg", + 
"", "", + "bw", + "", + "sn", + "xn--hlcj6aya9esc7a", + "", + "fj", + "", + "gm", + "", "", + "bf", + "", + "do", + "", "", + "gb", + "", + "ve", + "", "", + "es", + "", + "li", + "jp", + "", "", "", + "ee", + "", "", + "pk", + "", + "de", + "", "", + "gq", + "", + "bm", + "", "", + "kh", + "", + "im", + "", "", + "bb", + "", + "er", + "", "", "", "", + "tt", + "", "", + "vc", + "", + "si", + "", "", "", "", + "gn", + "", "", + "ec", + "", + "lt", + "", "", + "iq", + "", + "ua", + "", "", + "pw", + "", + "tj", + "", "", "", "", + "za", + "", "", + "pf", + "", + "xn--kgbechtv", + "", "", "", "", + "bn", + "xn--hgbk6aj7f53bba", + "", "", "", + "in", + "int", + "info", + "gp", + "", + "st", + "", "", "", "", + "ug", + "", "", "", "", + "pm", + "", "", "", "", + "gi", + "", "", "", "", + "kg", + "", "", + "hk", + "", + "sj", + "", "", + "wf", + "", "", "", "", "", "", + "va", + "", "", + "uk", + "", "", "", "", "", "", + "bi", + "biz", + "", "", "", "", "", "", "", "", "", + "", "", "", "", + "gt", + "", "", "", "", + "pn", + "", "", "", "", + "vg", + "", "", "", "", "", "", "", "", "", + "eg", + "", "", "", "", "", "", "", "", "", + "bt", + "", "", + "zw", + "", + "it", + "", "", + "kw", + "", "", "", "", "", "", + "hm", + "", "", "", "", "", "", "", "", "", + "bj", + "", "", + "dk", + "", "", "", "", "", "", "", "", "", + "", "", + "zm", + "", "", "", "", + "km", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", + "hn", + "", "", "", "", + "pt", + "", "", "", "", "", "", "", "", "", + "yt", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", + "kn", + "", "", "", "", "", "", "", "", "", + "dm", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", "", + "kp", + "", "", "", "", "", "", "", "", "", + "", "", + "vn", + "", "", "", "", + "ki", + "", "", "", "", "", "", "", "", "", + "", "", + "xn--9t4b11yi5a", + "", "", + "ht", + "", "", "", "", "", "", "", 
"", "", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", + "vi", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", "", + "et", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", "", + "", "", + "dj" + }; + + if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) + { + int key = tld_hash (str, len); + + if (key <= MAX_HASH_VALUE && key >= 0) + if (len == lengthtable[key]) + { + const char *s = wordlist[key]; + + if (*str == *s && !memcmp (str + 1, s + 1, len - 1)) + return s; + } + } + return 0; +} diff --git a/libclamav/phish_domaincheck_db.c b/libclamav/phish_domaincheck_db.c index c5efe4922..5da9cc40f 100644 --- a/libclamav/phish_domaincheck_db.c +++ b/libclamav/phish_domaincheck_db.c @@ -49,16 +49,6 @@ int domainlist_match(const struct cl_engine* engine,char* real_url,const char* d { const char* info; int rc = engine->domainlist_matcher ? regex_list_match(engine->domainlist_matcher,real_url,display_url,hostOnly ? pre_fixup : NULL,hostOnly,&info,0) : 0; - if(rc && info && info[0] && info[0] != ':') {/*match successful, and has custom flags*/ - if(strlen(info)==3 && isxdigit(info[0]) && isxdigit(info[1]) && isxdigit(info[2])) { - unsigned short notwantedflags=0; - sscanf(info,"%hx",¬wantedflags); - *flags &= ~notwantedflags;/* filter unwanted phishcheck flags */ - } - else { - cli_warnmsg("Phishcheck:Unknown flag format in domain-list, 3 hex digits expected"); - } - } return rc; } @@ -79,13 +69,6 @@ int is_domainlist_ok(const struct cl_engine* engine) return (engine && engine->domainlist_matcher) ? 
is_regex_ok(engine->domainlist_matcher) : 1; } -void domainlist_cleanup(const struct cl_engine* engine) -{ - if(engine && engine->domainlist_matcher) { - regex_list_cleanup(engine->domainlist_matcher); - } -} - void domainlist_done(struct cl_engine* engine) { if(engine && engine->domainlist_matcher) { diff --git a/libclamav/phish_whitelist.c b/libclamav/phish_whitelist.c index a9bbbed03..55fd2fd52 100644 --- a/libclamav/phish_whitelist.c +++ b/libclamav/phish_whitelist.c @@ -69,13 +69,6 @@ int is_whitelist_ok(const struct cl_engine* engine) return (engine && engine->whitelist_matcher) ? is_regex_ok(engine->whitelist_matcher) : 1; } -void whitelist_cleanup(const struct cl_engine* engine) -{ - if(engine && engine->whitelist_matcher) { - regex_list_cleanup(engine->whitelist_matcher); - } -} - void whitelist_done(struct cl_engine* engine) { if(engine && engine->whitelist_matcher) { diff --git a/libclamav/phishcheck.c b/libclamav/phishcheck.c index 4589eb5d4..ba8cb57e1 100644 --- a/libclamav/phishcheck.c +++ b/libclamav/phishcheck.c @@ -39,6 +39,7 @@ #include #include "clamav.h" +#include "cltypes.h" #include "others.h" #include "mbox.h" #include "message.h" @@ -47,6 +48,7 @@ #include "phish_domaincheck_db.h" #include "phish_whitelist.h" #include "iana_tld.h" +#include "iana_cctld.h" #define DOMAIN_REAL 1 @@ -140,8 +142,6 @@ static char empty_string[]=""; #define CLOAKED_URL "^"ANY_CLOAK"(\\."ANY_CLOAK"){0,3}$" static const char cloaked_host_regex[] = CLOAKED_URL; -static const char tld_regex[] = "^"iana_tld"$"; -static const char cctld_regex[] = "^"iana_cctld"$"; static const char dotnet[] = ".net"; static const char adonet[] = "ado.net"; static const char aspnet[] = "asp.net"; @@ -151,7 +151,10 @@ static const char gt[]=">"; static const char src_text[] = "src"; static const char href_text[] = "href"; static const char mailto[] = "mailto:"; +static const char mailto_proto[] = "mailto://"; static const char https[]="https://"; +static const char http[]="http://"; 
+static const char ftp[] = "ftp://"; static const size_t href_text_len = sizeof(href_text); static const size_t src_text_len = sizeof(src_text); @@ -161,7 +164,10 @@ static const size_t aspnet_len = sizeof(aspnet)-1; static const size_t lt_len = sizeof(lt)-1; static const size_t gt_len = sizeof(gt)-1; static const size_t mailto_len = sizeof(mailto)-1; +static const size_t mailto_proto_len = sizeof(mailto_proto)-1; static const size_t https_len = sizeof(https)-1; +static const size_t http_len = sizeof(http)-1; +static const size_t ftp_len = sizeof(ftp)-1; /* for urls, including mailto: urls, and (broken) http:www... style urls*/ /* refer to: http://www.w3.org/Addressing/URL/5_URI_BNF.html @@ -169,41 +175,13 @@ static const size_t https_len = sizeof(https)-1; * So the 'safe' char class has been split up * */ /* character classes */ -#define URI_alpha "a-zA-Z" #define URI_digit "0-9" -#define URI_safe_nodot "-$_@&" -#define URI_safe "-$_@.&" -#define URI_extra "!*\"'()," - -#define URI_hex "[0-9a-fA-f]" -#define URI_escape "%"URI_hex"{2}" -#define URI_xalpha "([" URI_safe URI_alpha URI_digit URI_extra "]|"URI_escape")" /* URI_safe has to be first, because it contains - */ -#define URI_xalpha_nodot "([" URI_safe_nodot URI_alpha URI_digit URI_extra "]|"URI_escape")" - -#define URI_xalphas_nodot URI_xalpha_nodot"*" - -#define URI_ialpha "["URI_alpha"]"URI_xalphas_nodot"" -#define URI_xpalpha URI_xalpha"|\\+" -#define URI_xpalpha_nodot URI_xalpha_nodot"|\\+" -#define URI_xpalphas_nodot "("URI_xpalpha_nodot")+" - -#define URI_scheme URI_ialpha -#define URI_tld iana_tld -#define URI_path1 URI_xpalphas_nodot"\\.("URI_xpalphas_nodot"\\.)*" - #define URI_IP_digits "["URI_digit"]{1,3}" #define URI_path_start "[/?:]?" 
#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}"URI_path_start -#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path +/* optional scheme prefix; the alternation has to be grouped, otherwise ":(//)?" binds only to "ftp" and "http"/"https" must be followed directly by the address */ +#define URI_numeric_URI "((http|https|ftp):(//)?)?"URI_numeric_path #define URI_numeric_fragmentaddress URI_numeric_URI -#define URI_URI1 "("URI_scheme":(//)?)?"URI_path1 -#define URI_URI2 URI_tld - -#define URI_fragmentaddress1 URI_URI1 -#define URI_fragmentaddress2 URI_URI2""URI_path_start - -#define URI_CHECK_PROTOCOLS "(http|https|ftp|mailto)://.+" /*Warning: take care when modifying this regex, it has been tweaked, and tuned, just don't break it please. * there is fragmentaddress1, and 2 to work around the ISO limitation of 509 bytes max length for string constants*/ @@ -235,7 +213,6 @@ static int string_assign_concatenated(struct string* dest, const char* prefix, c static void string_assign_null(struct string* dest); static char *rfind(char *start, char c, size_t len); static char hex2int(const unsigned char* src); -static int isTLD(const struct phishcheck* pchk,const char* str,int len); static enum phish_status phishingCheck(const struct cl_engine* engine,struct url_check* urls); static const char* phishing_ret_toString(enum phish_status rc); @@ -416,7 +393,7 @@ static int get_host(const struct phishcheck* s,const char* URL,int isReal,int* p } tld = strrchr(realhost,'.'); - rc = tld ? isTLD(s,tld,tld-realhost-1) : 0; + rc = tld ? !!in_tld_set(tld,tld-realhost-1) : 0; if(rc < 0) return rc; if(rc) @@ -438,28 +415,6 @@ static int get_host(const struct phishcheck* s,const char* URL,int isReal,int* p return 0; } -static int isCountryCode(const struct phishcheck* s,const char* str) -{ - return str ? 
!cli_regexec(&s->preg_cctld,str,0,NULL,0) : 0; -} - -static int isTLD(const struct phishcheck* pchk,const char* str,int len) -{ - if (!str) - return 0; - else { - char* s = cli_malloc(len+1); - int rc; - - if(!s) - return CL_EMEM; - strncpy(s,str,len); - s[len]='\0'; - rc = !cli_regexec(&pchk->preg_tld,s,0,NULL,0); - free(s); - return rc ? 1 : 0; - } -} /* * memrchr isn't standard, so I use this @@ -486,7 +441,7 @@ static void get_domain(const struct phishcheck* pchk,struct string* dest,struct string_assign(dest,host); return; } - if(isCountryCode(pchk,tld+1)) { + if(in_cctld_set(tld+1, strlen(tld+1))) { const char* countrycode = tld+1; tld = rfind(host->data,'.',tld-host->data-1); if(!tld) { @@ -495,7 +450,7 @@ static void get_domain(const struct phishcheck* pchk,struct string* dest,struct string_assign(dest,host); return; } - if(!isTLD(pchk,tld+1,countrycode-tld-2)) { + if(!in_tld_set(tld+1, countrycode-tld-2)) { string_assign_ref(dest,host,tld+1); return;/*it was a name like: subdomain.domain.uk, return domain.uk*/ } @@ -737,11 +692,7 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal) /* @end points to last character we want to be part of the URL */ end = host_begin + host_len - 1; } - /* terminate URL with a slash, except when we're at end of string */ - if(host_begin[host_len]) { - host_begin[host_len] = '/'; - end++; - } + host_begin[host_len] = '\0'; /* convert hostname to lowercase, but only hostname! 
*/ str_make_lowercase(host_begin, host_len); /* some broken MUAs put > in the href, and then @@ -797,6 +748,40 @@ int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs) if(!ctx->found_possibly_unwanted) *ctx->virname=NULL; +#if 0 + FILE *f = fopen("/home/edwin/quarantine/urls","r"); + if(!f) + abort(); + while(!feof(f)) { + struct url_check urls; + char line1[4096]; + char line2[4096]; + char line3[4096]; + + fgets(line1, sizeof(line1), f); + fgets(line2, sizeof(line2), f); + fgets(line3, sizeof(line3), f); + if(strcmp(line3, "\n") != 0) { + strcpy(line1, line2); + strcpy(line2, line3); + fgets(line3, sizeof(line3), f); + while(strcmp(line3, "\n") != 0) { + fgets(line3, sizeof(line3),f); + } + } + urls.flags = CL_PHISH_ALL_CHECKS; + urls.link_type = 0; + string_init_c(&urls.realLink, line1); + string_init_c(&urls.displayLink, line2); + string_init_c(&urls.pre_fixup.pre_displayLink, NULL); + urls.realLink.refcount=-1; + urls.displayLink.refcount=-1; + int rc = phishingCheck(ctx->engine, &urls); + //printf("%d\n",rc); + } + fclose(f); + return 0; +#endif for(i=0;icount;i++) if(hrefs->contents[i]) { struct url_check urls; @@ -928,44 +913,7 @@ int phishing_init(struct cl_engine* engine) return CL_EFORMAT; } - if(build_regex(&pchk->preg_cctld,cctld_regex,1)) { - free(pchk); - engine->phishcheck = NULL; - return CL_EFORMAT; - } - if(build_regex(&pchk->preg_tld,tld_regex,1)) { - free_regex(&pchk->preg_cctld); - free(pchk); - engine->phishcheck = NULL; - return CL_EFORMAT; - } - url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_fragmentaddress1,URI_fragmentaddress2")) *$"); - if(!url_regex || build_regex(&pchk->preg,url_regex,1)) { - free_regex(&pchk->preg_cctld); - free_regex(&pchk->preg_tld); - free(url_regex); - free(pchk); - engine->phishcheck = NULL; - return CL_EFORMAT; - } - free(url_regex); - realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_path1,URI_fragmentaddress2")) *$"); - if(!realurl_regex || 
build_regex(&pchk->preg_realurl, realurl_regex,1)) { - free_regex(&pchk->preg_cctld); - free_regex(&pchk->preg_tld); - free_regex(&pchk->preg); - free(url_regex); - free(realurl_regex); - free(pchk); - engine->phishcheck = NULL; - return CL_EFORMAT; - } - free(realurl_regex); if(build_regex(&pchk->preg_numeric,numeric_url_regex,1)) { - free_regex(&pchk->preg_cctld); - free_regex(&pchk->preg_tld); - free_regex(&pchk->preg); - free_regex(&pchk->preg_realurl); free(pchk); engine->phishcheck = NULL; return CL_EFORMAT; @@ -980,12 +928,8 @@ void phishing_done(struct cl_engine* engine) struct phishcheck* pchk = engine->phishcheck; cli_dbgmsg("Cleaning up phishcheck\n"); if(pchk && !pchk->is_disabled) { - free_regex(&pchk->preg); free_regex(&pchk->preg_hexurl); - free_regex(&pchk->preg_cctld); - free_regex(&pchk->preg_tld); free_regex(&pchk->preg_numeric); - free_regex(&pchk->preg_realurl); pchk->is_disabled = 1; } whitelist_done(engine); @@ -998,22 +942,165 @@ void phishing_done(struct cl_engine* engine) cli_dbgmsg("Phishcheck cleaned up\n"); } + +/*ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz*/ +static const uint8_t URI_alpha[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + 
+/*!"$%&'()*,-0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz*/ +static const uint8_t URI_xalpha_nodot[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/*!"$%&'()*+,-0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz*/ +static const uint8_t URI_xpalpha_nodot[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static inline int validate_uri_xalphas_nodot(const char *start, const char *end) +{ + const unsigned char *p = start; + 
for(p=start;p < (const unsigned char*)end; p++) { + if(!URI_xalpha_nodot[*p]) + return 0; + } + return 1; +} + +static inline int validate_uri_xpalphas_nodot(const char *start, const char *end) +{ + const unsigned char *p = start; + for(p=start;p < (const unsigned char*)end; p++) { + if(!URI_xpalpha_nodot[*p]) + return 0; + } + /* must have at least one char */ + return p > (const unsigned char*)start; +} + + +static inline int validate_uri_ialpha(const char *start, const char *end) +{ + const unsigned char *p = start; + if(start >= end || !URI_alpha[*p]) + return 0; + return validate_uri_xalphas_nodot(start + 1, end); +} + /* * Only those URLs are identified as URLs for which phishing detection can be performed. */ -static int isURL(const struct phishcheck* pchk,const char* URL) +static int isURL(const struct phishcheck* pchk,const char* URL, int accept_anyproto) { - return URL ? !cli_regexec(&pchk->preg,URL,0,NULL,0) : 0; + const char *start = NULL, *p, *q; + if(!URL) + return 0; + + switch (URL[0]) { + case 'h': + if (strncmp(URL, https, https_len) == 0) + start = URL + https_len; + else if (strncmp(URL, http, http_len) == 0) + start = URL + http_len; + break; + case 'f': + if (strncmp(URL, ftp, ftp_len) == 0) + start = URL + ftp_len; + break; + case 'm': + if (strncmp(URL, mailto_proto, mailto_proto_len) == 0) + start = URL + mailto_proto_len; + break; + } + if(start) { + if(start[0] == '\0') + return 0;/* empty URL */ + /* has a valid protocol, it is a URL */ + return 1; + } + start = accept_anyproto ? 
strchr(URL, ':') : NULL; + if(start) { + /* validate URI scheme */ + if(validate_uri_ialpha(URL, start)) { + if(start[1] == '/' && start[2] == '/') + start += 3; /* skip :// */ + else + start++; + } + else + start = URL; /* scheme invalid */ + } else + start = URL; + p = start; + do { + q = strchr(p, '.'); + if(q) { + if(!validate_uri_xpalphas_nodot(p, q)) + return 0; + p = q+1; + } + } while(q); + if (p == start) /* must have at least one dot in the URL */ + return 0; + return !!in_tld_set(p, strlen(p)); } /* * Check if this is a real URL, which basically means to check if it has a known URL scheme (http,https,ftp). * This prevents false positives with outbind:// and blocked:: links. */ +#if 0 static int isRealURL(const struct phishcheck* pchk,const char* URL) { return URL ? !cli_regexec(&pchk->preg_realurl,URL,0,NULL,0) : 0; } +#endif static int isNumericURL(const struct phishcheck* pchk,const char* URL) { @@ -1139,7 +1226,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url cli_dbgmsg("Phishcheck:URL after cleanup: %s->%s\n", urls->realLink.data, urls->displayLink.data); - if((!isURL(pchk, urls->displayLink.data) || !isRealURL(pchk, urls->realLink.data) ) && + if((!isURL(pchk, urls->displayLink.data, 1) || !isURL(pchk, urls->realLink.data, 0) ) && ( (phishy&PHISHY_NUMERIC_IP && !isNumericURL(pchk, urls->displayLink.data)) || !(phishy&PHISHY_NUMERIC_IP))) { cli_dbgmsg("Displayed 'url' is not url:%s\n",urls->displayLink.data); diff --git a/libclamav/phishcheck.h b/libclamav/phishcheck.h index cb4bff581..822d3ff64 100644 --- a/libclamav/phishcheck.h +++ b/libclamav/phishcheck.h @@ -44,10 +44,6 @@ struct string { }; struct phishcheck { - regex_t preg; - regex_t preg_realurl; - regex_t preg_tld; - regex_t preg_cctld; regex_t preg_numeric; regex_t preg_hexurl; int is_disabled; diff --git a/libclamav/readdb.c b/libclamav/readdb.c index d95d80a17..ef77469d0 100644 --- a/libclamav/readdb.c +++ b/libclamav/readdb.c @@ -1839,6 +1839,12 @@ 
int cl_build(struct cl_engine *engine) } } + if((ret = cli_build_regex_list(engine->whitelist_matcher))) { + return ret; + } + if((ret = cli_build_regex_list(engine->domainlist_matcher))) { + return ret; + } cli_md5db_build(engine->md5_mdb); cli_freeign(engine); cli_dconf_print(engine->dconf); diff --git a/libclamav/regex_list.c b/libclamav/regex_list.c index d33bc7b10..f4400f83e 100644 --- a/libclamav/regex_list.c +++ b/libclamav/regex_list.c @@ -42,6 +42,8 @@ #include #include +#include + #include "regex/regex.h" @@ -53,152 +55,471 @@ #include "matcher.h" #include "str.h" #include "readdb.h" +#include "jsparse/textbuf.h" -/*Tree*/ -enum token_op_t {OP_CHAR,OP_STDCLASS,OP_CUSTOMCLASS,OP_DOT,OP_LEAF,OP_ROOT,OP_PARCLOSE}; -typedef unsigned char* char_bitmap_p; -/* - * - * OP_CHAR: 1 character, c = character - * complex stuff: - * OP_STDCLASS: standard character class, c = char class, class: 1<<(index into std_class of class name) - * OP_CUSTOMCLASS: custom character class, first pointer in ptr array is a pointer to the bitmap table for this class - * OP_DOT: single . matching any character except \n - * OP_LEAF: this is a leaf node, reinterpret structure - */ -struct tree_node { - struct tree_node* next;/* next regex/complex sibling, or parent, if no more siblings , can't be NULL except for root node*/ +/* ------- parse a regular expression, and extract a static suffix ------*/ +enum node_type { + root=0, + concat, + alternate, /* | */ + optional,/* ?, * */ + leaf, /* a character */ + leaf_class /* character class */ + /* (x)+ is transformed into (x)*(x) */ +}; + +struct node { + enum node_type type; + struct node *parent; union { - struct tree_node** children;/* alternatives nr. 
of children, followed by (a null pointer terminated) regex leaf node pointers) */ - char_bitmap_p* bitmap; - struct leaf_info* leaf; + struct { + struct node* left; + struct node* right; + } children; + uint8_t* leaf_class_bitmap; + uint8_t leaf_char; } u; - enum token_op_t op; - unsigned char c; - char alternatives;/* number of (non-regex) children of node, i.e. sizeof(children)*/ - char listend;/* no more siblings, next pointer is pointer to parent*/ }; -struct leaf_info { - char* info;/* what does it mean that we reached the leaf...*/ - regex_t* preg;/* this is NULL if leaf node, and non-regex*/ -}; - -/* Character classes */ -static const char* std_class[] = { - "[:alnum:]", - "[:digit:]", - "[:punct:]", - "[:alpha:]", - "[:graph:]", - "[:space:]", - "[:blank:]", - "[:lower:]", - "[:upper:]", - "[:cntrl:]", - "[:print:]", - "[:xdigit:]" - /* don't change the order of these strings, unless you change them in generate_tables.c too, and regenerate the tables*/ -}; - - -#define STD_CLASS_CNT sizeof(std_class)/sizeof(std_class[0]) - -/* generated by contrib/phishing/generate_tables.c */ -static const unsigned char char_class_bitmap[STD_CLASS_CNT][32] = { - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, - 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - - {0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0x00, 0xfc, - 0x01, 0x00, 0x00, 0xf8, 0x01, 0x00, 0x00, 0x78, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - - {0x00, 
0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - - {0x00, 0x3e, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - - {0x00, 0x02, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xfe, 0xff, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - - {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - - {0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, - 0x7e, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} -}; - -static const unsigned short int char_class[256] = { - 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x260, 0x220, 0x220, 0x220, 0x220, 0x200, 0x200, - 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, - 0x460, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 
0x414, 0x414, 0x414, - 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, - 0x414, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, - 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x414, 0x414, 0x414, 0x414, 0x414, - 0x414, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, - 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x414, 0x414, 0x414, 0x414, 0x200, - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000 -}; - -static const size_t std_class_cnt = sizeof(std_class)/sizeof(std_class[0]); - /* Prototypes */ -static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info,int hostOnly); -static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info); -static void destroy_tree(struct regex_matcher* matcher); -static struct tree_node* tree_root_alloc(void); -static int build_regex_list(struct regex_matcher* matcher); -static void 
stack_destroy(struct node_stack* stack); +static size_t reverse_string(char *pattern); +static int add_pattern(struct regex_matcher *matcher, char *pattern); +static int add_pattern_suffix(struct regex_matcher *matcher, char *suffix, size_t suffix_len, struct regex_list *regex); +static int add_static_pattern(struct regex_matcher *matcher, char* pattern); +static int build_suffixtree_descend(struct regex_matcher *matcher, struct regex_list *regex, struct node *n, struct text_buffer *buf); +/* ---------- */ + +static uint8_t dot_bitmap[32]; + +static struct node* make_node(enum node_type type, struct node *left, struct node *right) +{ + struct node *n; + if(type == concat) { + if(left == NULL) + return right; + if(right == NULL) + return left; + } + n = cli_malloc(sizeof(*n)); + if(!n) + return NULL; + n->type = type; + n->parent = NULL; + n->u.children.left = left; + n->u.children.right = right; + if(left) + left->parent = n; + if(right) + right->parent = n; + return n; +} + +/* Deep-copy the subtree rooted at p. Returns NULL when p is NULL or when the copy of the node itself (or of its class bitmap) cannot be allocated; a failure while copying a child is not reported, the child is simply left NULL. */ +static struct node *dup_node(struct node *p) +{ + struct node *node_left, *node_right; + struct node *d; + + if(!p) + return NULL; + d = cli_malloc(sizeof(*d)); + if(!d) + return NULL; + d->type = p->type; + d->parent = NULL; + switch(p->type) { + case leaf: + d->u.leaf_char = p->u.leaf_char; + break; + case leaf_class: + d->u.leaf_class_bitmap = cli_malloc(32); + if(!d->u.leaf_class_bitmap) { + /* do not leak the half-built copy */ + free(d); + return NULL; + } + memcpy(d->u.leaf_class_bitmap, p->u.leaf_class_bitmap, 32); + break; + default: + node_left = dup_node(p->u.children.left); + node_right = dup_node(p->u.children.right); + d->u.children.left = node_left; + d->u.children.right = node_right; + if(node_left) + node_left->parent = d; + if(node_right) + node_right->parent = d; + break; + } + return d; +} + +static struct node *make_charclass(uint8_t *bitmap) +{ + struct node *v = cli_malloc(sizeof(*v)); + if(!v) + return NULL; + v->type = leaf_class; + v->parent = NULL; + v->u.leaf_class_bitmap = bitmap; + return v; +} + +static struct node 
*make_leaf(char c) +{ + struct node *v = cli_malloc(sizeof(*v)); + if(!v) + return NULL; + v->type = leaf; + v->parent = NULL; + v->u.leaf_char = c; + return v; +} + +static void destroy_tree(struct node *n) +{ + if(!n) + return; + switch(n->type) { + case concat: + case alternate: + case optional: + destroy_tree(n->u.children.left); + destroy_tree(n->u.children.right); + break; + case leaf_class: + if(n->u.leaf_class_bitmap != dot_bitmap) + free(n->u.leaf_class_bitmap); + break; + case root: + case leaf: + break; + } + free(n); +} + +/* Parse a bracket expression into a 256-bit (32 byte) bitmap. + * On entry *pos must index the first character after the opening '['; + * on return *pos indexes the closing ']'. + * Returns a cli_malloc()ed bitmap, dot_bitmap (match anything) for the + * unsupported [:class:] and [.collating.] forms, or NULL if allocation fails. */ +static uint8_t* parse_char_class(const char *pat, size_t *pos) +{ + unsigned char range_start=0; + int hasprev = 0; + uint8_t* bitmap = cli_malloc(32); + if(!bitmap) + return NULL; + if (pat[*pos]=='^') { + memset(bitmap,0xFF,32);/*match chars not in brackets*/ + ++*pos; + } + else + memset(bitmap,0x00,32); + do { + /* literal ] can be first character, so test for it at the end of the loop, for example: []] */ + if (pat[*pos]=='-' && hasprev && pat[*pos+1]!=']') { + /* it is a range; a '-' directly before the closing ] is a literal and is handled below */ + unsigned char range_end; + unsigned int c; + assert(range_start); + ++*pos; + if (pat[*pos]=='[') + if (pat[*pos+1]=='.') { + /* collating sequence not handled */ + free(bitmap); + /* we are parsing the regex for a + * filter, be conservative and + * tell the filter that anything could + * match here */ + while(pat[*pos] != ']') ++*pos; + ++*pos; + while(pat[*pos] != ']') ++*pos; + return dot_bitmap; + } + else + range_end = pat[*pos]; + else + range_end = pat[*pos]; + for(c=range_start+1;c<=range_end;c++) + bitmap[c>>3] ^= 1<<(c&0x7); + /* consume the range end, otherwise the next iteration toggles it back as a literal */ + ++*pos; + hasprev = 0; + } + else if (pat[*pos]=='[' && pat[*pos+1]==':') { + /* char class */ + free(bitmap); + while(pat[*pos] != ']') ++*pos; + ++*pos; + while(pat[*pos] != ']') ++*pos; + return dot_bitmap; + } else { + /* index with an unsigned value: plain char may be signed */ + const unsigned char ch = (unsigned char)pat[*pos]; + bitmap[ch>>3] ^= 1<<(ch&0x7); + /* remember this character (not the one after it) as a possible range start */ + range_start = ch; + ++*pos; + hasprev = 1; + } + } while(pat[*pos]!=']'); + return bitmap; +} + +static struct node* parse_regex(const char *p, size_t *last) +{ + struct node *v 
= NULL; + struct node *right; + struct node *tmp; + uint8_t *bitmap; + + while(p[*last] != '$' && p[*last] != '\0') { + switch(p[*last]) { + case '|': + ++*last; + right = parse_regex(p, last); + v = make_node(alternate, v, right); + if(!v) + return NULL; + break; + case '*': + case '?': + v = make_node(optional, v, NULL); + if(!v) + return NULL; + ++*last; + break; + case '+': + /* (x)* */ + tmp = make_node(optional, v, NULL); + if(!tmp) + return NULL; + /* (x) */ + right = dup_node(v); + if(!right) + return NULL; + /* (x)*(x) => (x)+ */ + v = make_node(concat, tmp, right); + if(!v) + return NULL; + ++*last; + break; + case '(': + ++*last; + right = parse_regex(p, last); + if(!right) + return NULL; + ++*last; + v = make_node(concat, v, right); + break; + case ')': + return v; + case '.': + right = make_charclass(dot_bitmap); + if(!right) + return NULL; + v = make_node(concat, v, right); + if(!v) + return NULL; + ++*last; + break; + case '[': + /* step over the opening '[', parse_char_class starts right after it */ + ++*last; + bitmap = parse_char_class(p, last); + if(!bitmap) + return NULL; + right = make_charclass(bitmap); + if(!right) { + if(bitmap != dot_bitmap) + free(bitmap); + return NULL; + } + v = make_node(concat, v, right); + if(!v) + return NULL; + /* step over the closing ']'; must not fall through into the escape handling below */ + ++*last; + break; + case '\\': + /* next char is escaped, advance pointer + * and let fall-through handle it */ + ++*last; + default: + right = make_leaf(p[*last]); + v = make_node(concat, v, right); + if(!v) + return NULL; + ++*last; + break; + } + } + return v; +} + +#define BITMAP_HASSET(b, i) (b[i>>3] & (1<<(i&7))) + +static int build_suffixtree_ascend(struct regex_matcher *matcher, struct regex_list *regex, struct node *n, struct text_buffer *buf, struct node *prev) +{ + size_t i; + while(n) { + struct node *q = n; + switch(n->type) { + case root: + textbuffer_putc(buf, '\0'); + if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0) + return CL_EMEM; + return 0; + case leaf: + textbuffer_putc(buf, n->u.leaf_char); + n = n->parent; + break; + case leaf_class: + if(memcmp(n->u.leaf_class_bitmap, dot_bitmap, sizeof(dot_bitmap)) == 0) { + textbuffer_putc(buf, '\0'); + if(add_pattern_suffix(matcher, buf->data, 
buf->pos, regex) < 0) + return CL_EMEM; + return 0; + } + for(i=0;i<255;i++) { + if(BITMAP_HASSET(n->u.leaf_class_bitmap, i)) { + size_t pos; + pos = buf->pos; + textbuffer_putc(buf, i); + if(build_suffixtree_ascend(matcher, regex, n->parent, buf, n) < 0) + return CL_EMEM; + buf->pos = pos; + } + } + return 0; + case concat: + if(prev != n->u.children.left) { + if(build_suffixtree_descend(matcher, regex, n->u.children.left, buf) < 0) + return CL_EMEM; + /* we're done here, descend will call + * ascend if needed */ + return 0; + } else { + n = n->parent; + } + break; + case alternate: + n = n->parent; + break; + case optional: + textbuffer_putc(buf, '\0'); + if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0) + return CL_EMEM; + return 0; + } + prev = q; + } + return 0; +} + +static int build_suffixtree_descend(struct regex_matcher *matcher, struct regex_list *regex, struct node *n, struct text_buffer *buf) +{ + size_t pos; + while(n && n->type == concat) { + n = n->u.children.right; + } + if(!n) + return 0; + /* find out end of the regular expression, + * if it ends with a static pattern */ + switch(n->type) { + case alternate: + /* save pos as restart point */ + pos = buf->pos; + if(build_suffixtree_descend(matcher, regex, n->u.children.left, buf) < 0) + return CL_EMEM; + buf->pos = pos; + if(build_suffixtree_descend(matcher, regex, n->u.children.right, buf) < 0) + return CL_EMEM; + buf->pos = pos; + break; + case optional: + textbuffer_putc(buf, '\0'); + if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0) + return CL_EMEM; + return 0; + case leaf: + case leaf_class: + if(build_suffixtree_ascend(matcher, regex, n, buf, NULL) < 0) + return CL_EMEM; + return 0; + default: + break; + } + return 0; +} + + +/* ----- shift-or filtering -------------- */ + +#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1 << ((val) & 0x1f))) +#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1 << ((val) & 0x1f))) + +static void SO_init(struct 
filter *m) +{ + memset(m->B, ~0, sizeof(m->B)); + memset(m->end, ~0, sizeof(m->end)); + memset(m->end_fast, ~0, sizeof(m->end_fast)); +} + +/* because we use uint32_t */ +#define MAXSOPATLEN 32 + +/* merge another pattern into the filter + * add('abc'); add('bcd'); will match [ab][bc][cd] */ +static int SO_preprocess_add(struct filter *m, const unsigned char *pattern, size_t len) +{ + uint16_t q; + uint8_t j; + + /* cut length, and make it modulo 2 */ + if(len > MAXSOPATLEN) { + len = MAXSOPATLEN; + } else { + /* we use 2-grams, must be multiple of 2 */ + len = len & ~1; + } + if(!len) + return 0; + + /* Shift-Or like preprocessing */ + for(j=0;j < len-1;j++) { + /* use overlapping 2-grams. We need them overlapping because matching can start at any position */ + q = cli_readint16( &pattern[j] ); + m->B[q] &= ~(1 << j); + } + /* we use variable length patterns, use last character to mark pattern end, + * can lead to false positives.*/ + /* mark that at state j, the q-gram q can end the pattern */ + if(j) { + j--; + m->end[q] &= ~(1 << j); + m->end_fast[pattern[j]] &= (1<B; + const uint32_t *End = m->end; + const uint32_t *EndFast = m->end_fast; + + if(!len) return -1; + /* Shift-Or like search algorithm */ + for(j=0;j < len-1; j++) { + const uint16_t q0 = cli_readint16( &data[j] ); + uint32_t match_end; + state = (state << 1) | B[q0]; + /* state marks with a 0 bit all active states + * End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern + * if we got two 0's at matching positions, it means we encountered a pattern's end */ + match_end = state | EndFast[data[j+1]]; + if((match_end != 0xffffffff) && (state | End[q0]) != 0xffffffff) { + /* note: we rely on short-circuit eval here, we only evaluate and fetch End[q0], if + * end_fast has matched. 
This reduces cache pressure on End[], and allows us to keep the working + * set inside L2 */ + + /* if state is reachable, and this character can finish a pattern, assume match */ + /* to reduce false positives check if qgram can finish the pattern */ + /* return position of probable match */ + /* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the + * longest pattern that could match */ + return j >= MAXSOPATLEN ? j - MAXSOPATLEN : 0; + } + } + /* no match */ + return -1; +} + +/* ----------------------------------------------------------- */ -#ifndef NDEBUG -void dump_tree(struct tree_node* root); -#endif #define MATCH_SUCCESS 0 #define MATCH_FAILED -1 @@ -233,6 +554,43 @@ static inline size_t get_char_at_pos_with_skip(const struct pre_fixup_info* info return (pos>0 && !str[realpos]) ? '\0' : str[realpos>0?realpos-1:0]; } +static int validate_subdomain(const struct regex_list *regex, const struct pre_fixup_info *pre_fixup, const char *buffer, size_t buffer_len, char *real_url, size_t real_len, char *orig_real_url) +{ + char c; + const char *matched; + size_t match_len; + + if(!regex || !regex->pattern) + return 0; + match_len = strlen(regex->pattern); + if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') && + (match_len == buffer_len || /* full match */ + (match_len < buffer_len && + ((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) ) + /* subdomain matched*/)) { + cli_dbgmsg("Got a match: %s with %s\n", buffer, regex->pattern); + cli_dbgmsg("Before inserting .: %s\n", orig_real_url); + if(real_len >= match_len + 1) { + const size_t pos = real_len - match_len - 1; + if(real_url[pos] != '.') { + /* we need to shift left, and insert a '.' + * we have an extra '.' at the beginning inserted by get_host to have room, + * orig_real_url has to be used here, + * because we want to overwrite that extra '.' 
*/ + size_t orig_real_len = strlen(orig_real_url); + cli_dbgmsg("No dot here:%s\n",real_url+pos); + real_url = orig_real_url; + memmove(real_url, real_url+1, orig_real_len-match_len-1); + real_url[orig_real_len-match_len-1]='.'; + cli_dbgmsg("After inserting .: %s\n", real_url); + } + } + return 1; + } + cli_dbgmsg("Ignoring false match: %s with %s, mismatched character: %c\n", buffer, regex->pattern, c); + return 0; +} + /* * @matcher - matcher structure to use * @real_url - href target @@ -246,24 +604,28 @@ static inline size_t get_char_at_pos_with_skip(const struct pre_fixup_info* info * Do not send NULL pointers to this function!! * */ -int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char** info,int is_whitelist) +int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char **info, int is_whitelist) { char* orig_real_url = real_url; - massert(matcher); - massert(real_url); - massert(display_url); - massert(info); + const char *vinfo; + struct regex_list *regex; + + assert(matcher); + assert(real_url); + assert(display_url); + *info = NULL; if(!matcher->list_inited) return 0; - massert(matcher->list_built); + assert(matcher->list_built); /* skip initial '.' inserted by get_host */ if(real_url[0] == '.') real_url++; if(display_url[0] == '.') display_url++; { size_t real_len = strlen(real_url); size_t display_len = strlen(display_url); - size_t buffer_len = (hostOnly && !is_whitelist) ? real_len : real_len + display_len + 1 + (is_whitelist ? 1 : 0); - char* buffer = cli_malloc(buffer_len+1); + size_t buffer_len = (hostOnly && !is_whitelist) ? 
real_len + 1 : real_len + display_len + 1 + 1; + char *buffer = cli_malloc(buffer_len+1); + char *bufrev; size_t i; int rc = 0; struct cli_ac_data mdata; @@ -272,61 +634,48 @@ int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* di return CL_EMEM; strncpy(buffer,real_url,real_len); - buffer[real_len]= (!is_whitelist && hostOnly) ? '\0' : ':'; + buffer[real_len]= (!is_whitelist && hostOnly) ? '/' : ':'; if(!hostOnly || is_whitelist) { strncpy(buffer+real_len+1,display_url,display_len); - if(is_whitelist) - buffer[buffer_len - 1] = '/'; - buffer[buffer_len]=0; } + buffer[buffer_len - 1] = '/'; + buffer[buffer_len]=0; cli_dbgmsg("Looking up in regex_list: %s\n", buffer); - if(hostOnly) { - if((rc = cli_ac_initdata(&mdata, 0, AC_DEFAULT_TRACKLEN))) - return rc; - rc = 0; + if((rc = cli_ac_initdata(&mdata, 0, AC_DEFAULT_TRACKLEN))) + return rc; - for(i = 0; i < matcher->root_hosts_cnt; i++) { - /* doesn't need to match terminating \0*/ - rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len,info, &matcher->root_hosts[i] ,&mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL); - cli_ac_freedata(&mdata); - if(rc) { - char c; - const char* matched = strchr(*info,':'); - const size_t match_len = matched ? strlen(matched+1) : 0; - if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') && - (match_len == buffer_len || /* full match */ - (match_len < buffer_len && - ((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) ) - /* subdomain matched*/)) { + bufrev = cli_strdup(buffer); + if(!bufrev) + return CL_EMEM; + reverse_string(bufrev); + rc = SO_search(&matcher->filter, (const unsigned char*)bufrev, buffer_len) != -1; + if(!rc) { + /* filter says this suffix doesn't match. 
+ * The filter has false positives, but no false + * negatives */ + return 0; + } - cli_dbgmsg("Got a match: %s with %s\n", buffer, *info); - cli_dbgmsg("Before inserting .: %s\n", orig_real_url); - if(real_len >= match_len + 1) { - const size_t pos = real_len - match_len - 1; - if(real_url[pos] != '.') { - /* we need to shift left, and insert a '.' - * we have an extra '.' at the beginning inserted by get_host to have room, - * orig_real_url has to be used here, - * because we want to overwrite that extra '.' */ - size_t orig_real_len = strlen(orig_real_url); - cli_dbgmsg("No dot here:%s\n",real_url+pos); - real_url = orig_real_url; - memmove(real_url, real_url+1, orig_real_len-match_len-1); - real_url[orig_real_len-match_len-1]='.'; - cli_dbgmsg("After inserting .: %s\n", real_url); - } - } - break; - } - cli_dbgmsg("Ignoring false match: %s with %s, mismatched character: %c\n", buffer, *info, c); - rc=0; + rc = cli_ac_scanbuff((unsigned char*)bufrev,buffer_len, &vinfo, &matcher->suffixes,&mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL); + cli_ac_freedata(&mdata); + + if(rc) { + /* TODO loop over multiple virusnames here */ + regex = (struct regex_list*)vinfo; + do { + /* loop over multiple regexes corresponding to + * this suffix */ + if (!regex->preg.re_magic) { + /* we matched a static pattern */ + rc = validate_subdomain(regex, pre_fixup, buffer, buffer_len, real_url, real_len, orig_real_url); + } else { + rc = !cli_regexec(&regex->preg, buffer, 0, NULL, 0); } - } - } else - rc = 0; - if(!rc) - rc = match_node(hostOnly ? matcher->root_regex_hostonly : matcher->root_regex,(unsigned char*)buffer,buffer_len,info) == MATCH_SUCCESS ? 
CL_VIRUS : CL_SUCCESS; + if(rc) *info = regex->pattern; + regex = regex->nxt; + } while(!rc && regex); + } free(buffer); if(!rc) cli_dbgmsg("Lookup result: not in regex list\n"); @@ -336,56 +685,6 @@ int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* di } } -/* node stack */ -#define NODE_STACK_INITIAL 1024 -#define NODE_STACK_GROW 4096 -/* Initialize @stack */ -static int stack_init(struct node_stack* stack) -{ - massert(stack); - - stack->cnt = 0; - stack->capacity = NODE_STACK_INITIAL; - stack->data = cli_malloc(stack->capacity * sizeof(*stack->data)); - if(!stack->data) - return CL_EMEM; - else - return CL_SUCCESS; -} - -/* Reset @stack pointer, but don't realloc */ -static void stack_reset(struct node_stack* stack) -{ - massert(stack); - - stack->cnt = 0; -} - -/* Push @node on @stack, growing it if necessarry */ -static int stack_push(struct node_stack* stack,struct tree_node* node) -{ - massert(stack); - massert(stack->data); - - if(stack->cnt == stack->capacity) { - stack->capacity += NODE_STACK_GROW; - stack->data = cli_realloc2(stack->data,stack->capacity*sizeof(*stack->data)); - if(!stack->data) - return CL_EMEM; - } - stack->data[stack->cnt++] = node; - return CL_SUCCESS; -} - -/* Pops node from @stack, doesn't realloc */ -static struct tree_node* stack_pop(struct node_stack* stack) -{ - massert(stack); - massert(stack->data); - massert(stack->cnt);/*don't pop from empty stack */ - - return stack->cnt ? 
stack->data[--stack->cnt] : NULL; -} /* Initialization & loading */ /* Initializes @matcher, allocating necesarry substructures */ @@ -393,90 +692,21 @@ int init_regex_list(struct regex_matcher* matcher) { int rc; - massert(matcher); - matcher->list_inited = 0; - matcher->root_hosts_cnt = 0; - matcher->root_hosts = NULL; - matcher->root_hosts_cnt = 0; - - matcher->root_regex = tree_root_alloc(); - if(!matcher->root_regex) { - return CL_EMEM; - } - - matcher->root_regex_hostonly = tree_root_alloc(); - if(!matcher->root_regex_hostonly) { - free(matcher->root_regex); - return CL_EMEM; - } - - if(( rc = stack_init(&matcher->node_stack) )) { - free(matcher->root_regex_hostonly); - free(matcher->root_regex); - return rc; - } - if(( rc = stack_init(&matcher->node_stack_alt) )) { - free(matcher->root_regex_hostonly); - free(matcher->root_regex); - stack_destroy(&matcher->node_stack); - return rc; - } + assert(matcher); + memset(matcher, 0, sizeof(*matcher)); matcher->list_inited=1; - matcher->list_built=1;/* its empty, but pretend its built, so that load_ will realloc root_hosts */ + matcher->list_built=0; matcher->list_loaded=0; + hashtab_init(&matcher->suffix_hash, 10); + if((rc = cli_ac_init(&matcher->suffixes, 2, 32))) { + return rc; + } + SO_init(&matcher->filter); return CL_SUCCESS; } -/* inserts @pattern into @root, using ac-matcher - * although the name might be confusing, @pattern is not a regex!*/ -static int add_regex_list_element(struct cli_matcher* root,const char* pattern,char* info) -{ - int ret; - struct cli_ac_patt *new = cli_calloc(1,sizeof(*new)); - size_t len,i; - - if(!new) - return CL_EMEM; - massert(root); - massert(pattern); - - len = strlen(pattern); - /* need not to match \0 too */ - new->rtype = 0; - new->type = 0; - new->sigid = 0; - new->parts = 0; - new->partno = 0; - new->mindist = 0; - new->maxdist = 0; - new->offset = 0; - new->target = 0; - new->length = len; - new->ch[0] = new->ch[1] |= CLI_MATCH_IGNORE; - if(new->length > 
root->maxpatlen) - root->maxpatlen = new->length; - - new->pattern = cli_malloc(sizeof(new->pattern[0])*len); - if(!new->pattern) { - free(new); - return CL_EMEM; - } - for(i=0;ipattern[i]=pattern[i];/*new->pattern is short int* */ - - - new->virname = cli_strdup(info); - if((ret = cli_ac_addpatt(root,new))) { - free(new->virname); - free(new->pattern); - free(new); - return ret; - } - return CL_SUCCESS; -} - static int functionality_level_check(char* line) { char* ptmin; @@ -527,14 +757,10 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int optio int rc,line=0; char buffer[FILEBUFF]; - massert(matcher); + assert(matcher); if(matcher->list_inited==-1) return CL_EMALFDB; /* already failed to load */ -/* if(matcher->list_loaded) { - cli_warnmsg("Regex list has already been loaded, ignoring further requests for load\n"); - return CL_SUCCESS; - }*/ if(!fd && !dbio) { cli_errmsg("Unable to load regex list (null file)\n"); return CL_EIO; @@ -548,7 +774,6 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int optio fatal_error(matcher); return rc; } - /*atexit(regex_list_done); TODO: destroy this in manager.c */ } /* * Regexlist db format (common to .wdb(whitelist) and .pdb(domainlist) files: @@ -573,11 +798,13 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int optio while(cli_dbgets(buffer, FILEBUFF, fd, dbio)) { char* pattern; char* flags; + size_t pattern_len; + cli_chomp(buffer); if(!*buffer) continue;/* skip empty lines */ - if(functionality_level_check(buffer)) + if(functionality_level_check(buffer)) continue; line++; @@ -591,83 +818,39 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int optio flags = buffer+1; pattern++; - if(is_whitelist) { - const size_t pattern_len = strlen(pattern); - if(pattern_len < FILEBUFF) { - pattern[pattern_len] = '/'; - pattern[pattern_len+1] = '\0'; - } - else { - cli_errmsg("Overlong regex line %d\n",line); - fatal_error(matcher); - 
return CL_EMALFDB; - } + pattern_len = strlen(pattern); + if(pattern_len < FILEBUFF) { + pattern[pattern_len] = '/'; + pattern[pattern_len+1] = '\0'; + } + else { + cli_errmsg("Overlong regex line %d\n",line); + fatal_error(matcher); + return CL_EMALFDB; } - if((buffer[0] == 'R' && !is_whitelist) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_whitelist)) {/*regex*/ - if(( rc = add_pattern(matcher,(const unsigned char*)pattern,flags, buffer[0] == 'Y') )) + if((buffer[0] == 'R' && !is_whitelist) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_whitelist)) { + /* regex for hostname*/ + if (( rc = add_pattern(matcher, pattern) )) return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB; } - else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) {/*matches displayed host*/ - struct cli_matcher* root; - if(matcher->list_built) { - struct cli_matcher* old_hosts = matcher->root_hosts; - matcher->root_hosts_cnt++; - - matcher->root_hosts = cli_realloc(matcher->root_hosts, matcher->root_hosts_cnt * sizeof(*matcher->root_hosts)); - if(!matcher->root_hosts) { - matcher->root_hosts = old_hosts;/* according to manpage this must still be valid*/ - return CL_EMEM; - } - - root = &matcher->root_hosts[matcher->root_hosts_cnt-1]; - memset(root, 0, sizeof(struct cli_matcher)); - - cli_dbgmsg("regex_list: Initialising AC pattern matcher\n"); - if((rc = cli_ac_init(root, cli_ac_mindepth, cli_ac_maxdepth))) { - /* no need to free previously allocated memory here */ - cli_errmsg("regex_list: Can't initialise AC pattern matcher\n"); - return rc; - } - matcher->list_built = 0; - } - else { - root = &matcher->root_hosts[matcher->root_hosts_cnt-1]; - } - if(( rc = add_regex_list_element(root,pattern,flags) )) + else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) { + /*matches displayed host*/ + if (( rc = add_static_pattern(matcher, pattern) )) return rc==CL_EMEM ? 
CL_EMEM : CL_EMALFDB; } else { return CL_EMALFDB; - /* this is useless, we have host, and regex matches - if(( rc = add_regex_list_element(matcher->root_urls,pattern,flags) )) - return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;*/ } } matcher->list_loaded = 1; - if(( rc = build_regex_list(matcher) )) - return rc; -#ifndef NDEBUG -/* dump_tree(matcher->root_regex);*/ -#endif - if(!matcher->list_built) { - cli_errmsg("Regex list not loaded: build failed!\n"); - fatal_error(matcher); - return CL_EMALFDB; - } - regex_list_cleanup(matcher); return CL_SUCCESS; } -static struct tree_node ** tree_node_get_children(const struct tree_node* node) -{ - return node->op==OP_CUSTOMCLASS ? (node->u.children[1] ? node->u.children+1 : NULL) :node->u.children; -} - /* Build the matcher list */ -static int build_regex_list(struct regex_matcher* matcher) +int cli_build_regex_list(struct regex_matcher* matcher) { int rc; if(!matcher->list_inited || !matcher->list_loaded) { @@ -675,9 +858,9 @@ static int build_regex_list(struct regex_matcher* matcher) return -1;/*TODO: better error code */ } cli_dbgmsg("Building regex list\n"); - if(matcher->root_hosts) - if(( rc = cli_ac_buildtrie(&matcher->root_hosts[matcher->root_hosts_cnt-1]) )) - return rc; + hashtab_free(&matcher->suffix_hash); + if(( rc = cli_ac_buildtrie(&matcher->suffixes) )) + return rc; matcher->list_built=1; return CL_SUCCESS; @@ -686,864 +869,193 @@ static int build_regex_list(struct regex_matcher* matcher) /* Done with this matcher, free resources */ void regex_list_done(struct regex_matcher* matcher) { - massert(matcher); + assert(matcher); - regex_list_cleanup(matcher); if(matcher->list_loaded) { - if(matcher->root_hosts) { - size_t i; - for(i=0;i<matcher->root_hosts_cnt;i++) - cli_ac_free(&matcher->root_hosts[i]); - free(matcher->root_hosts); - matcher->root_hosts=NULL; + size_t i; + /* TODO: call it, but be sure it won't free virname */ + //cli_ac_free(&matcher->suffixes); + if(matcher->suffix_regexes) { + for(i=0;i<matcher->suffix_cnt;i++) { + 
struct regex_list *r = matcher->suffix_regexes[i]; + while(r) { + cli_regfree(&r->preg); + r = r->nxt; + } + } + free(matcher->suffix_regexes); + matcher->suffix_regexes = NULL; } - - matcher->root_hosts_cnt=0; + hashtab_free(&matcher->suffix_hash); matcher->list_built=0; - destroy_tree(matcher); matcher->list_loaded=0; } if(matcher->list_inited) { matcher->list_inited=0; } - stack_destroy(&matcher->node_stack); - stack_destroy(&matcher->node_stack_alt); -} - -/* Tree matcher algorithm */ -struct token_t -{ - union { - const unsigned char* start; - char_bitmap_p bitmap; - unsigned char c; - } u; - size_t len; - char type; -}; - -enum {TOKEN_CHAR,TOKEN_DOT,TOKEN_PAR_OPEN,TOKEN_PAR_CLOSE,TOKEN_BRACKET,TOKEN_ALT,TOKEN_REGEX,TOKEN_DONE}; - -static const unsigned char* getNextToken(const unsigned char* pat,struct token_t* token) -{ - massert(pat); - massert(token); - - switch(*pat) { - case '\\': - token->type=TOKEN_CHAR; - token->u.c = *(++pat); - if(islower(token->u.c)) { - /* handle \n, \t, etc. 
*/ - char fmt[3] = {'\\', '\0', '\0'}; - char c; - - fmt[1] = token->u.c; - if(snprintf(&c,1,fmt)!=1) { - token->type=TOKEN_REGEX; - token->u.start = pat; - } - else - token->u.c=c; - } - token->len = 1; - break; - case '|': - token->type=TOKEN_ALT; - break; - case '*': - case '+': - case '?': - case '{': - case '}': - token->type=TOKEN_REGEX; - break; - case '[': - { - /*TODO: implement*/ - /*see if it is something simple like a list of characters, a range, or negated ...*/ - const unsigned char* old=pat++;/* save this in case we change our mind and decide this is too complicated for us to handle*/ - unsigned char range_start=0; - int hasprev = 0; - char_bitmap_p bitmap = cli_malloc(32); - if(!bitmap) - return NULL; - if (*pat=='^') { - memset(bitmap,0xFF,32);/*match chars not in brackets*/ - pat++; - } - else - memset(bitmap,0x00,32); - do { - /* literal ] can be first character, so test for it at the end of the loop, for example: []] */ - if (*pat=='-' && hasprev) { - /* it is a range*/ - unsigned char range_end; - unsigned int c; - massert(range_start); - pat++; - if (pat[0]=='[') - if (pat[1]=='.') { - if(pat[2]=='-' && pat[3]=='.' 
&& pat[4]==']') - range_end = '-'; - else { - /* this is getting complicated, bail out */ - cli_warnmsg("confused about collating sequences in regex,bailing out"); - pat=old; - token->type=TOKEN_REGEX; - break; - } - } - else - range_end = *pat; - else - range_end = *pat; - for(c=range_start+1;c<=range_end;c++) - bitmap[c>>3] ^= 1<<(c&0x7); - hasprev = 0; - } - else if (pat[0]=='[' && pat[1]==':') { - const unsigned char* end; - int len,found=-1; - size_t i; - - pat+=2; - end=(unsigned char*)strstr((const char*)pat,":]"); - if(!end) { - cli_warnmsg("confused about std char class syntax regex,bailing out"); - pat=old; - token->type=TOKEN_REGEX; - break; - } - - len = end-pat; - for(i=0;i>3] ^= 1<<(i&0x7); - } - else { - /*unknown class*/ - cli_warnmsg("confused about regex bracket expression, bailing out"); - pat=old; - token->type=TOKEN_REGEX; - break; - } - } - else { - bitmap[*pat>>3] ^= 1<<(*pat&0x7); - pat++; - range_start = *pat; - hasprev = 1; - } - } while(*pat!=']'); - /*TODO: see if this bitmap already exists, then reuse*/ - token->type = TOKEN_BRACKET; - token->u.bitmap = bitmap; - break; - } - case ']': - massert(0 && "Encountered ] without matching ["); - /* bad state */ - break; - case '.': - token->type=TOKEN_DOT; - break; - case '(': - token->type=TOKEN_PAR_OPEN; - break; - case ')': - token->type=TOKEN_PAR_CLOSE; - break; - default: - token->type=TOKEN_CHAR; - token->u.c = *pat; - token->len=1; - break; - } - return ++pat; -} - -#define INITIAL_ALT_STACK 10 -#define ALT_STACK_GROW 20 - -static const unsigned char* find_regex_start(const unsigned char* pat) -{ - struct token_t token; - /*TODO: find where the regex part begins, for ex: - * abcd+, regex begins at 'd' - * */ - const unsigned char* last=NULL; - const unsigned char* tmp=NULL; - const unsigned char** altpositions = cli_malloc(INITIAL_ALT_STACK*sizeof(*altpositions)); - size_t altpositions_capacity = INITIAL_ALT_STACK; - size_t altpositions_cnt = 0; - char lasttype = -1; - if(!altpositions) 
- return NULL; - massert(pat); - - /* Try to parse pattern till special regex chars are encountered, that the tree-matcher doesn't handle, like: +,*,{}. - * The tricky part is that once we encounter these, the previous 'atom' has to be passed on to the regex matcher, so we have to - * back up to the last known good position - * Example, if we have: abc(defg)+, then only abc can be handled by tree parser, so we have to return the position of (. - * Another example: abc(defg|xyz|oz+|pdo), the last known good position is |, after xyz - * TODO: what about open parantheses? maybe once we found a special char, we have top back out before the first (? - * */ - do { - tmp = pat; - pat = getNextToken(pat,&token); - if(token.type!=TOKEN_REGEX) { - last = tmp; - lasttype = token.type; - if(token.type==TOKEN_BRACKET && token.u.bitmap) - free(token.u.bitmap); - if(token.type==TOKEN_ALT || token.type==TOKEN_PAR_OPEN) { - /* save this position on stack, succesfully parsed till here*/ - if(altpositions_cnt && altpositions[altpositions_cnt-1][0]=='|') - /* encountered another alternate (|) operator, override previous | position stored */ - altpositions[altpositions_cnt-1]=last; - else { - altpositions[altpositions_cnt++] = last; - if(altpositions_cnt == altpositions_capacity) { - altpositions_capacity += ALT_STACK_GROW; - altpositions = cli_realloc2(altpositions,altpositions_capacity*sizeof(*altpositions)); - if(!altpositions) - return NULL; - } - } - } else if (lasttype==TOKEN_PAR_CLOSE) { - /* remove last stored position from stack, succesfully this last group */ - altpositions_cnt--; - massert(altpositions_cnt>0); - } - } - else { - if(altpositions_cnt) - last = altpositions[0 /*altpositions_cnt-1*/];/*TODO: which index here?, see above TODO... */ - /*last stored 'safe' position where no special (+,*,{}) regex chars were encountered*/ - } - } while(*pat && token.type!=TOKEN_REGEX); - free(altpositions); - return *pat ? 
last : last+1; -} - -static struct tree_node* tree_node_alloc(struct tree_node* next,char listend) -{ - struct tree_node* node = cli_malloc(sizeof(*node)); - if(node) { - node->alternatives=0; - node->next=next; - node->listend=listend; - node->u.children=NULL; - } - return node; -} - -static struct tree_node* tree_root_alloc(void) -{ - struct tree_node* root=tree_node_alloc(NULL,1); - if(root) { - root->op=OP_ROOT; - root->c=0; - root->next=NULL; - root->listend=1; - } - return root; -} - -static struct tree_node* tree_node_char_binsearch(const struct tree_node* node,const char csearch,int* left) -{ - int right; - struct tree_node **children; - massert(node); - massert(left); - - children = tree_node_get_children(node); - right = node->alternatives-1; - *left = 0; - if(!node->alternatives) - return NULL; - massert(children); - while(*left<=right) { - int mid = *left+(right-*left)/2; - if(children[mid]->c == csearch) - return children[mid]; - else if(children[mid]->c < csearch) - *left=mid+1; - else - right=mid-1; - } - return NULL; -} - -static struct tree_node* tree_get_next(struct tree_node* node) -{ - struct tree_node** children; - massert(node); - children = tree_node_get_children(node); - - if(!node->alternatives && children && children[0]) - return children[0]; - else if(node->alternatives<=1) - return node; - else - return children[0]->next; -} - -static size_t tree_node_get_array_size(const struct tree_node* node) -{ - massert(node); - /* if op is CUSTOMCLASS, then first pointer is pointer to bitmap, so array size is +1 */ - return (node->alternatives + (node->op==OP_CUSTOMCLASS ? 
1 : 0)) * sizeof(node->u.children[0]); -} - -static struct tree_node* tree_node_char_insert(struct tree_node* node,const char c,int left) -{ - struct tree_node* new, *alt = tree_get_next(node); - struct tree_node **children; - node->alternatives++; - node->u.children = cli_realloc2(node->u.children,tree_node_get_array_size(node)); - if(!node->u.children) - return NULL; - - children = node->op==OP_CUSTOMCLASS ? node->u.children+1 : node->u.children; - - new = tree_node_alloc(alt , node == alt ); - if(new) { - new->op=OP_CHAR; - new->c=c; - } - - if(node->alternatives-left-1>0) - memmove(&children[left+1],&children[left],(node->alternatives-left-1)*sizeof(node->u.children[0])); - children[left] = new; - - return new; -} - -static void tree_node_insert_nonbin(struct tree_node* node, struct tree_node* new) -{ - struct tree_node **children; - massert(node); - massert(new); - - children = tree_node_get_children(node); - if(node->alternatives) { - massert(children); - if(children[0]->next == node) { - int i; - new->listend = 1; - for(i=0;i<node->alternatives;i++) { - children[i]->next = new; - children[i]->listend = 0; - } - } - else { - struct tree_node* p; - for(p = children[0]->next ; p->next != node ; p = p->next) - massert(!p->listend); - new->listend = 1; - p->listend = 0; - p->next = new; - } - } - else { - int idx = node->op==OP_CUSTOMCLASS ? 
1 : 0; - if(node->u.children) - if(node->u.children[idx]) { - node = node->u.children[idx]; - while(node->next && !node->listend) - node = node->next; - node->listend = 0; - new->next = node->next; - node->next = new; - new->listend=1; - return; - } - node->u.children = cli_realloc2(node->u.children,sizeof(node->u.children[0])*(2)); - if(node->u.children) { - node->u.children[idx] = new; - } - } -} - -static unsigned char char_getclass(const unsigned char* bitmap) -{ - size_t i; - massert(bitmap); - - for(i=0;i>3)) - return i; - return std_class_cnt; -} - -static void stack_destroy(struct node_stack* stack) -{ - massert(stack); - if(stack->data) - free(stack->data); - stack->data = NULL; - stack->capacity = 0; -} - -/* call this after whitelist load is complete, and the tree is no longer going to be modified */ -void regex_list_cleanup(struct regex_matcher* matcher) -{ - massert(matcher); - - stack_destroy(&matcher->node_stack); - stack_destroy(&matcher->node_stack_alt); - stack_init(&matcher->node_stack); - stack_init(&matcher->node_stack_alt); } int is_regex_ok(struct regex_matcher* matcher) { - massert(matcher); + assert(matcher); return (!matcher->list_inited || matcher->list_inited!=-1);/* either we don't have a regexlist, or we initialized it successfully */ } -/* returns 0 on success, regexec error code otherwise */ -static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info, int hostonly) +static int add_newsuffix(struct regex_matcher *matcher, struct regex_list *info, char *suffix, size_t len) { - int bol=1; - const unsigned char* pat_end = find_regex_start(pat); - struct token_t token; - struct tree_node* node; - - massert(matcher); + struct cli_matcher *root = &matcher->suffixes; + struct cli_ac_patt *new = cli_calloc(1,sizeof(*new)); + size_t i; + int ret; - node = hostonly ? 
matcher->root_regex_hostonly : matcher->root_regex; + if(!new) + return CL_EMEM; + assert(root && suffix); - stack_reset(&matcher->node_stack); - stack_reset(&matcher->node_stack_alt); - stack_push(&matcher->node_stack,node); + new->rtype = 0; + new->type = 0; + new->sigid = 0; + new->parts = 0; + new->partno = 0; + new->mindist = 0; + new->maxdist = 0; + new->offset = 0; + new->target = 0; + new->length = len; - for(;node->op!=OP_LEAF;){ - if(patch[0] = new->ch[1] |= CLI_MATCH_IGNORE; + if(new->length > root->maxpatlen) + root->maxpatlen = new->length; - switch(token.type) { - case TOKEN_CHAR: - { - /* search for char in tree */ - int left; - struct tree_node* newnode = tree_node_char_binsearch(node,token.u.c,&left); - if(newnode) - node = newnode; - else { - /* not found, insert it */ - node = tree_node_char_insert(node,token.u.c,left); - } - break; - } + new->pattern = cli_malloc(sizeof(new->pattern[0])*len); + if(!new->pattern) { + free(new); + return CL_EMEM; + } + for(i=0;ipattern[i] = suffix[i];/*new->pattern is short int* */ - case TOKEN_PAR_OPEN: - stack_push(&matcher->node_stack_alt,NULL);/* marker */ - stack_push(&matcher->node_stack,node); - break; + new->virname = (char*)info; + if((ret = cli_ac_addpatt(root,new))) { + free(new->pattern); + free(new); + return ret; + } + SO_preprocess_add(&matcher->filter, suffix, len); + return CL_SUCCESS; +} - case TOKEN_PAR_CLOSE: { - /*TODO: test this!!!*/ - struct tree_node* node_alt = node; - node = tree_node_alloc(NULL,1); - node->op=OP_PARCLOSE; - node->c=0; - node->listend=1; - tree_node_insert_nonbin(node_alt,node); - while (( node_alt = stack_pop(&matcher->node_stack_alt) )) { - tree_node_insert_nonbin(node_alt,node); - } - stack_pop(&matcher->node_stack); - break; - } +#define MODULE "regex_list: " +/* ------ load a regex, determine suffix, determine suffix2regexlist map ---- */ - case TOKEN_ALT: - stack_push(&matcher->node_stack_alt,node); - node = stack_pop(&matcher->node_stack); - 
stack_push(&matcher->node_stack,node); - break; +/* returns 0 on success, clamav error code otherwise */ +static int add_pattern_suffix(struct regex_matcher *matcher, char *suffix, size_t suffix_len, struct regex_list *regex) +{ + const struct element *el; - case TOKEN_BRACKET: - { - struct tree_node* new = tree_node_alloc(tree_get_next(node),1); - unsigned char charclass = char_getclass(token.u.bitmap); - if(charclass == std_class_cnt) {/*not a std char class*/ - new->op = OP_CUSTOMCLASS; - new->u.children = cli_malloc(sizeof(new->u.children[0])*2); - if(!new->u.children) - return CL_EMEM; - new->u.bitmap[0] = token.u.bitmap; - new->u.bitmap[1] = NULL; - tree_node_insert_nonbin(node,new); - node = new; - } - else { - new->op = OP_STDCLASS; - new->c = charclass; - tree_node_insert_nonbin(node,new); - node=new; - } - break; - } - - case TOKEN_DOT: - { - struct tree_node* new = tree_node_alloc(tree_get_next(node),1); - new->op = OP_DOT; - tree_node_insert_nonbin(node,new); - node=new; - break; - } - - case TOKEN_REGEX: - case TOKEN_DONE: { - struct leaf_info* leaf=cli_malloc(sizeof(*leaf)); - if(!leaf) - return CL_EMEM; - leaf->info = cli_strdup(info); - if(token.type==TOKEN_REGEX) { - int rc; - struct tree_node* new; - regex_t* preg; - preg=cli_malloc(sizeof(*preg)); - if(!preg) - return CL_EMEM; - rc = cli_regcomp(preg,(const char*)token.u.start,REG_EXTENDED|(bol?0:REG_NOTBOL)); - leaf->preg=preg; - if(rc) - return rc; - new=cli_malloc(sizeof(*new)); - if(!new) - return CL_EMEM; - new->op=OP_LEAF; - new->next=node; - new->alternatives=0; - new->u.leaf=leaf; - new->listend=1; - tree_node_insert_nonbin(node,new); - } - else { - leaf->preg=NULL; - node->alternatives=0; - node->u.leaf=leaf; - node->op=OP_LEAF; - } - return 0; - } - } - - bol=0; + assert(matcher); + el = hashtab_find(&matcher->suffix_hash, suffix, suffix_len); + /* TODO: what if suffixes are prefixes of eachother and only one will + * match? 
*/ + if(el) { + /* existing suffix */ + assert(el->data < matcher->suffix_cnt); + regex->nxt = matcher->suffix_regexes[el->data]; + matcher->suffix_regexes[el->data] = regex; + cli_dbgmsg(MODULE "added new regex to existing suffix %s: %s\n", suffix, regex->pattern); + } else { + /* new suffix */ + size_t n = matcher->suffix_cnt++; + el = hashtab_insert(&matcher->suffix_hash, suffix, suffix_len, n); + matcher->suffix_regexes = cli_realloc(matcher->suffix_regexes, (n+1)*sizeof(*matcher->suffix_regexes)); + if(!matcher->suffix_regexes) + return CL_EMEM; + matcher->suffix_regexes[n] = regex; + add_newsuffix(matcher, regex, suffix, suffix_len); + cli_dbgmsg(MODULE "added new suffix %s, for regex: %s\n", suffix, regex->pattern); } return 0; } -/* c has to be unsigned char here!! */ -static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info) -{ - struct tree_node** children; - int rc; - - massert(node); - massert(c); - massert(info); - - if(!node->u.children) - return MATCH_FAILED;/* tree empty */ - *info = NULL; - len++; - c--; - for(;;) { - massert(node); - children = node->u.children; - switch(node->op) { - case OP_ROOT: - rc=1; - break; - case OP_PARCLOSE: - /*this isn't a real character, so don't move*/ - c--; - len++; - rc=1; - break; - case OP_CHAR: - massert(*c==node->c && "We know this has to match"); - rc = 1;/* *c==node->c;- we know it has matched */ - break; - case OP_DOT: - rc = *c!='\n'; - break; - case OP_STDCLASS: - rc = char_class[*c]&(node->c); - break; - case OP_CUSTOMCLASS: - { - char_bitmap_p bitmap; - massert(children); - bitmap = (char_bitmap_p)node->u.bitmap[0]; - children++; - rc = bitmap[*c>>3]&(1<<(*c&0x7)); - break; - } - case OP_LEAF: - { - const struct leaf_info* leaf = node->u.leaf; - /*isleaf = 1;*/ - if(leaf->preg) { - rc = !cli_regexec(leaf->preg,(const char*)c,0,NULL,0); - } - else { - massert(*c==node->c && "We know this has to match[2]"); - rc = 1; - } - if(rc) { - *info = leaf->info; - return 
MATCH_SUCCESS; - } - break; - } - default: - /* impossible */ - cli_errmsg("Encountered invalid operator in tree:%d\n",node->op); - exit(1); - } - len--; - if(!len) rc=0; - c++; - if(rc) { - const char csearch = *c; - int left = 0,right = node->alternatives-1; - int mid; - /*matched so far, go deeper*/ - /*do a binary search between children */ - massert(children); - while(left<=right) { - mid = left+(right-left)/2; - if (children[mid]->c == csearch) - break; - else if(children[mid]->c < csearch) - left=mid+1; - else - right=mid-1; - } - if(left<=right) { - node = children[mid]; - massert(node); - } - else { - if(node->alternatives) { - if(!children[0]->listend) { - node = children[0]; - c++; - len--; - } - while(node && node->listend) { - node = node->next;/* climb up */ - c--; - len++; - } - if(!node || !node->next) - return MATCH_FAILED;/* reached root node */ - node=node->next; - c--; - len++; - } - else if(node->u.children) { - struct tree_node* rewrite_next = NULL; - if(node->op==OP_PARCLOSE) - rewrite_next = node; - node = children[0]; - massert(node); - massert(node->op!=OP_CHAR); - if(rewrite_next) - node->next = rewrite_next;/* this node is pointed to by several parent nodes, - we need to know - from which one we came, so we can find out way back - should we fail to match somewhere deeper*/ - } - } - } - else { - /* this node didn't match, try sibling, or parent (if no more siblings) */ - while(node && node->listend) { - node = node->next;/* sibling of parent */ - c--; - len++; - } - if(!node || !node->next) /* reached root node, it has no next */ - return MATCH_FAILED; - else { - c--; - len++; - node=node->next; - } - } - } - return MATCH_FAILED; -} - -/* push node on stack, only if it isn't there already */ -static void stack_push_once(struct node_stack* stack,struct tree_node* node) +static size_t reverse_string(char *pattern) { + size_t len = strlen(pattern); size_t i; - massert(stack); - massert(node); - - for(i=0;i < stack->cnt;i++) - 
if(stack->data[i]==node) - return; - stack_push(stack,node); + for(i=0; i < (len/2); i++) { + char aux = pattern[i]; + pattern[i] = pattern[len-i-1]; + pattern[len-i-1] = aux; + } + return len; } -static void destroy_tree_internal(struct regex_matcher* matcher,struct tree_node* node) +static int add_static_pattern(struct regex_matcher *matcher, char* pattern) { - struct tree_node **children; - massert(matcher); - massert(node); - - children = tree_node_get_children(node); - if(node->op==OP_LEAF) { - struct leaf_info* leaf = node->u.leaf; - if(node->next && !node->listend) - destroy_tree_internal(matcher,node->next); - stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.leaf);/* cast to make compiler happy, and to not make another stack implementation for storing void* */ - stack_push_once(&matcher->node_stack,node); - if(leaf->preg) { - cli_regfree(leaf->preg); - free(leaf->preg); - leaf->preg=NULL; - } - if(leaf->info) { - free(leaf->info); - leaf->info=NULL; - } - /* return;*/ - } - if(node->alternatives) { - int i; - struct tree_node* p; - massert(children); - p = children[0]->op==OP_LEAF ? NULL : children[0]->next; - for(i=0;i<node->alternatives;i++) - destroy_tree_internal(matcher,children[i]); - if(p && p!=node) - destroy_tree_internal(matcher,p);/*??
is this ok, or without _internal?*/ - } - else { - if(children) { - if(children[0]) - destroy_tree_internal(matcher,children[0]); - } - } - if(node->op!=OP_LEAF && node->next && !node->listend) - destroy_tree_internal(matcher,node->next); - if(node->u.children) - stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.children);/* cast to make compiler happy, it isn't really a tree_node* */ - if(node->op==OP_CUSTOMCLASS && node->u.children[0]) { - free(node->u.children[0]); - node->u.children[0]=NULL; - } - stack_push_once(&matcher->node_stack,node); + size_t len; + struct regex_list *regex = cli_malloc(sizeof(*regex)); + if(!regex) + return CL_EMEM; + len = reverse_string(pattern); + regex->nxt = NULL; + regex->pattern = cli_strdup(pattern); + regex->preg.re_magic = 0; + return add_pattern_suffix(matcher, pattern, len, regex); } -static void destroy_tree(struct regex_matcher* matcher) +static int add_pattern(struct regex_matcher *matcher, char *pattern) { - /* we might have the same node linked by different nodes, so a recursive walk&free doesn't work in all situations, - * i.e. 
it might double-free, so instead of freeing, just push the nodes on a stack, and later free the nodes in that stack, - * (and push to stack only if it doesn't contain it already*/ - massert(matcher); + struct text_buffer buf; + struct node *n; + size_t last=0; + int rc; + struct regex_list *regex = cli_malloc(sizeof(*regex)); + struct node root_node; + size_t len; + /* we only match the host, so remove useless stuff */ + const char remove_end[] = "([/?].*)?/"; + const char remove_end2[] = "([/?].*)/"; - stack_reset(&matcher->node_stack); - destroy_tree_internal(matcher,matcher->root_regex); - destroy_tree_internal(matcher,matcher->root_regex_hostonly); - while (matcher->node_stack.cnt) { - struct tree_node* node = stack_pop(&matcher->node_stack); - if(node) - free(node); - } -} -#ifndef NDEBUG -static void dump_node(struct tree_node* node) -{ - int i; - struct tree_node* p,**children; - massert(node); - if(node->op==OP_LEAF) { - if(node->u.leaf->preg) - printf("n%p [label=\"regex\\nleaf\"]",(void*)node); - else - printf("n%p [label=\"%c\\nleaf\"];\n",(void*)node,node->c); - if(node->next && !node->listend) { - printf("n%p -> n%p;\n",(void*)node,(void*)node->next); - dump_node(node->next); - } - return; - } - printf("n%p [label=\"%c\\n%d\\nlistend:%d\"];\n",(void*)node,(node->op==OP_ROOT||node->op==OP_PARCLOSE) ?'@' :node->c,node->op,node->listend); - if(node->next) - printf("n%p -> n%p;\n",(void*)node,(void*)node->next); - printf("n%p -> {",(void*)node);/*using address of node as id*/ - children = tree_node_get_children(node); - if(node->alternatives) - massert(children); - for(i=0;i<node->alternatives;i++) - printf("n%p ",(void*)children[i]); - if(node->alternatives && children[0]->op!=OP_LEAF) - for(p=children[0]->next;p!=node;p=p->next) - { - massert(p); - printf("n%p ",(void*)p); - if(p->op==OP_LEAF || p->listend) - break; - } - if(!node->alternatives && children && children[0]) - printf("n%p ",(void*)children[0]); - printf("};\n"); - printf("{rank=same;"); -
for(i=0;i<node->alternatives;i++) - printf("n%p ",(void*)node->u.children[i]); - if(node->alternatives && children[0]->op!=OP_LEAF) - for(p=children[0]->next;p!=node;p=p->next) - { - printf("n%p ",(void*)p); - if(p->op==OP_LEAF || p->listend) - break; - } - if(!node->alternatives && children && children[0]) - printf("n%p ",(void*)children[0]); - printf("};\n"); - for(i=0;i<node->alternatives;i++) - dump_node(children[i]); - if(node->alternatives && children[0]->op!=OP_LEAF) - for(p=children[0]->next;p!=node;p=p->next) - { - dump_node(p); - if(p->op==OP_LEAF || p->listend) - break; - } - if(!node->alternatives && children && children[0]) - dump_node(children[0]); -} -void dump_tree(struct tree_node* root) -{ - /*use dot/dotty from graphviz to view it*/ - massert(root); - printf("digraph tree {\n"); - dump_node(root); - printf("}\n"); + if(!regex) + return CL_EMEM; + + len = strlen(pattern); + if(len > sizeof(remove_end)) { + if(strncmp(&pattern[len - sizeof(remove_end)+1], remove_end, sizeof(remove_end)-1) == 0) { + len -= sizeof(remove_end) - 1; + } + if(strncmp(&pattern[len - sizeof(remove_end2)+1], remove_end2, sizeof(remove_end2)-1) == 0) { + len -= sizeof(remove_end2) - 1; + } + } + pattern[len] = '\0'; + + + rc = cli_regcomp(&regex->preg, pattern, REG_EXTENDED); + if(rc) { + size_t buflen = cli_regerror(rc, &regex->preg, NULL, 0); + char *errbuf = cli_malloc(buflen); + if(errbuf) { + cli_regerror(rc, &regex->preg, errbuf, buflen); + cli_errmsg(MODULE "Error compiling regular expression %s: %s\n", pattern, errbuf); + free(errbuf); + } else { + cli_errmsg(MODULE "Error compiling regular expression: %s\n", pattern); + } + return rc; + cli_regfree(&regex->preg); + free(regex); + return CL_EMALFDB; + } + regex->pattern = cli_strdup(pattern); + regex->nxt = NULL; + + n = parse_regex(pattern, &last); + memset(&buf, 0, sizeof(buf)); + memset(&root_node, 0, sizeof(buf)); + n->parent = &root_node; + + rc = build_suffixtree_descend(matcher, regex, n, &buf); + destroy_tree(n); + return rc; }
-#endif diff --git a/libclamav/regex_list.h b/libclamav/regex_list.h index 90a2ef132..4103bf446 100644 --- a/libclamav/regex_list.h +++ b/libclamav/regex_list.h @@ -24,39 +24,37 @@ #ifndef _REGEX_LIST_H #define _REGEX_LIST_H -#ifdef NDEBUG -#define massert(x) (void)(0) -#else -/*debug version, massert enabled*/ - -#define __massert_fail(expr,file,line) (void)cli_errmsg("Assertion failed at %s:%d\n %s\n",file,line,expr) - -#define massert(expr) ((void) ((expr) ? (void)0 : (__massert_fail (#expr,__FILE__,__LINE__)))) -#endif - #include "phishcheck.h" #include "readdb.h" #include "matcher.h" #include <zlib.h> /* for gzFile */ -struct node_stack { - struct tree_node** data; - size_t capacity; - size_t cnt; + +struct regex_list { + const char *pattern; + regex_t preg; + struct regex_list *nxt; +}; + +struct filter { + uint32_t B[65536]; + uint32_t end_fast[256]; + uint32_t end[65536]; + unsigned long m; }; struct regex_matcher { - struct cli_matcher* root_hosts; - struct tree_node* root_regex; - struct tree_node* root_regex_hostonly; - struct node_stack node_stack; - struct node_stack node_stack_alt; - size_t root_hosts_cnt; - int list_inited; - int list_loaded; - int list_built; + struct hashtable suffix_hash; + size_t suffix_cnt; + struct regex_list **suffix_regexes; + struct cli_matcher suffixes; + struct filter filter; + int list_inited:2; + int list_loaded:2; + int list_built:2; }; -int regex_list_match(struct regex_matcher* matcher, char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup, int hostOnly,const char** info,int is_whitelist); +int cli_build_regex_list(struct regex_matcher* matcher); +int regex_list_match(struct regex_matcher* matcher, char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup, int hostOnly,const char **info, int is_whitelist); int init_regex_list(struct regex_matcher* matcher); int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options,int is_whitelist,struct cli_dbio *dbio); void
regex_list_cleanup(struct regex_matcher* matcher);