performance improvements for URL matching (bb #725, bb #650):

* use a suffix AC-trie and a shift-or FSM to filter
* rewrite the URL regex in C
* use a perfect hash to lookup TLD and ccTLD, instead of a regex
* TODO: suffixes having a common prefix: loop over all of them
	cli_ac_free: multiple virname pointing to same location


git-svn: trunk@3978
This commit is contained in:
Török Edvin
2008-07-23 13:51:57 +00:00
parent f8a82180da
commit 2e11bcdfd9
16 changed files with 2222 additions and 1436 deletions

View File

@@ -1,3 +1,12 @@
Wed Jul 23 16:32:32 EEST 2008 (edwin)
------------------------------------
* libclamav: performance improvements for URL matching (bb #725, bb #650):
* use a suffix AC-trie and a shift-or FSM to filter
* rewrite the URL regex in C
* use a perfect hash to lookup TLD and ccTLD, instead of a regex
* TODO: suffixes having a common prefix: loop over all of them
cli_ac_free: multiple virname pointing to same location
Mon Jul 21 12:16:44 CEST 2008 (tk)
----------------------------------
* sigtool/vba.c: fix crash on error in vba code (bb#1106)

View File

@@ -1,7 +1,7 @@
PERL=perl
CC=cc
all: entitylist.h encoding_aliases.h gentbl encname_chars.h
all: entitylist.h encoding_aliases.h gentbl encname_chars.h generate_hash
entities_parsed: entities entities/* entity_decl_parse.pl
$(PERL) entity_decl_parse.pl $</* | sort -u >$@
@@ -9,6 +9,9 @@ entities_parsed: entities entities/* entity_decl_parse.pl
generate_entitylist: generate_entitylist.c ../../libclamav/hashtab.h ../../libclamav/hashtab.c ../../libclamav/others.c
$(CC) -I. -DHAVE_CONFIG_H -DCLI_MEMFUNSONLY -DPROFILE_HASHTABLE $< ../../libclamav/hashtab.c ../../libclamav/others.c -o $@
generate_hash: generate_hash.c ../../libclamav/hashtab.h ../../libclamav/hashtab.c ../../libclamav/others.c
$(CC) -I. -DHAVE_CONFIG_H -DCLI_MEMFUNSONLY -DPROFILE_HASHTABLE $< ../../libclamav/hashtab.c ../../libclamav/others.c -o $@
generate_encoding_aliases: generate_encoding_aliases.c ../../libclamav/hashtab.c ../../libclamav/others.c ../../libclamav/htmlnorm.h ../../libclamav/entconv.h ../../libclamav/cltypes.h ../../libclamav/hashtab.h ../../libclamav/hashtab.h
$(CC) -I. -DHAVE_CONFIG_H -DCLI_MEMFUNSONLY -DPROFILE_HASHTABLE $< ../../libclamav/hashtab.c ../../libclamav/others.c -o $@

View File

@@ -26,30 +26,11 @@ OUTFILE=iana_tld.h
echo "Downloading updated tld list from iana.org"
wget $IANA_TLD -O $TMP || exit 2
echo "Download complete, parsing data"
# 174 is the code for |
TLDLIST=$(egrep -v ^# $TMP | tr \\n \\174 | sed 's/[^a-zA-Z]$//')
echo "Parse complete, removing tmpfile"
rm $TMP
echo "Generating tld list in $OUTFILE"
cat >$OUTFILE <<EOF
#ifndef IANA_TLD_H
#define IANA_TLD_H
EOF
echo -n "#define iana_tld \"(" >>$OUTFILE
echo -n $TLDLIST >>$OUTFILE
echo ")\"" >>$OUTFILE
grep -Ev ^# $TMP | tr [A-Z] [a-z] | gperf -C -l -L ANSI-C -E -C -H tld_hash -N in_tld_set|grep -v '^#line' | sed -e 's/^const struct/static const struct/' -e 's/register //g' >iana_tld.h
echo "Downloading updated country-code list from iana.org"
wget $IANA_CCTLD -O $TMP || exit 2
echo "Download complete, parsing data"
CCTLDLIST=$(cat $TMP | egrep -oi "<a href=[^>]+>\\.([a-zA-Z]+).+</a>" | egrep -o ">.[a-zA-Z]+" | colrm 1 2 | tr \\n \\174 | sed 's/[^a-zA-Z]$//')
echo "Parse complete, removing tmpfile"
rm $TMP
echo "Generating cctld list in $OUTFILE"
echo -n "#define iana_cctld \"(" >>$OUTFILE
echo -n $CCTLDLIST >>$OUTFILE
echo ")\"" >>$OUTFILE
echo "#endif" >>$OUTFILE
echo "Finished succesfully"
cat $TMP | grep country-code|egrep -oi "<a
href=[^>]+>\\.([a-zA-Z]+).+</a>"|egrep -o ">.[a-zA-Z]+" | colrm 1 2 | tr [A-Z] [a-z]| gperf -C -l -L ANSI-C -E -C -H cctld_hash -N in_cctld_set |grep -v '^#line'|sed -e 's/^const struct/static const struct/' -e 's/register //g' >iana_cctld.h
echo "Done"

View File

@@ -26,17 +26,4 @@ echo "Downloading updated tld list from iana.org"
wget $IANA_TLD -O $TMP || exit 2
echo "Download complete, parsing data"
# 174 is the code for |
TLDLIST=$(egrep -v ^# $TMP|tr \\n \\174 )
echo "Parse complete, removing tmpfile"
rm $TMP
echo "Generating $OUTFILE"
cat >$OUTFILE <<EOF
#ifndef IANA_TLD_H
#define IANA_TLD_H
EOF
echo -n "#define iana_tld \"(" >>$OUTFILE
echo -n $TLDLIST >>$OUTFILE
echo ")\"" >>$OUTFILE
echo "#endif" >>$OUTFILE
echo "Finished succesfully"
grep -Ev ^# $TMP | tr [A-Z] [a-z] | gperf -C -H tld_hash -N in_tld_set -l|grep -v '^#line' | sed -e 's/^const struct/static const struct/' -e 's/register //g'

View File

@@ -361,7 +361,7 @@ All 4 tests passed
\item The exact output from \verb+make check+
\item Output of \verb+uname -mrsp+
\item your \verb+config.log+
\item The following files from the \verb+unit-tests/+ directory:
\item The following files from the \verb+unit_tests/+ directory:
\begin{itemize}
\item \verb+test.log+
\item \verb+clamscan.log+

View File

@@ -367,10 +367,18 @@ void hashtab_clear(struct hashtable *s)
if(s->htable[i].key && s->htable[i].key != DELETED_KEY)
free((void *)s->htable[i].key);
}
memset(s->htable, 0, s->capacity);
if(s->htable)
memset(s->htable, 0, s->capacity);
s->used = 0;
}
/*
 * Release the storage owned by a hashtable.
 *
 * Runs hashtab_clear() on the table first, then frees the bucket array and
 * resets htable/capacity so the struct no longer refers to freed memory.
 * The struct hashtable itself is not freed here.
 *
 * s: table to release; NULL is accepted and treated as a no-op.
 */
void hashtab_free(struct hashtable *s)
{
	if(!s)
		return;
	hashtab_clear(s);
	free(s->htable);
	s->htable = NULL;
	s->capacity = 0;
}
int hashtab_store(const struct hashtable *s,FILE* out)
{

View File

@@ -82,7 +82,7 @@ int hashtab_init(struct hashtable *s,size_t capacity);
const struct element* hashtab_insert(struct hashtable *s, const char* key, const size_t len, const element_data data);
void hashtab_delete(struct hashtable *s,const char* key,const size_t len);
void hashtab_clear(struct hashtable *s);
void hashtab_free(struct hashtable *s);
int hashtab_load(FILE* in, struct hashtable *s);
int hashtab_store(const struct hashtable *s,FILE* out);

505
libclamav/iana_cctld.h Normal file
View File

@@ -0,0 +1,505 @@
/* ANSI-C code produced by gperf version 3.0.3 */
/* Command-line: gperf -C -l -L ANSI-C -E -C -H cctld_hash -N in_cctld_set */
/* Computed positions: -k'1-2' */
#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
&& ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
&& (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
&& ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
&& ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
&& ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
&& ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \
&& ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \
&& ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
&& ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
&& ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
&& ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \
&& ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
&& ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
&& ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
&& ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \
&& ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
&& ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
&& ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
&& ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \
&& ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \
&& ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \
&& ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126))
/* The character set is not based on ISO-646. */
#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gnu-gperf@gnu.org>."
#endif
/* maximum key range = 472, duplicates = 0 */
/*
 * gperf-generated hash for the country-code TLD keyword set.
 *
 * The result is len plus two association values: one selected by str[1] and
 * one selected by str[0] shifted 25 slots further into the table.
 * Both str[0] and str[1] are read unconditionally, so the caller must pass
 * at least two readable bytes (in_cctld_set() checks len before calling).
 */
#ifdef __GNUC__
__inline
#else
#ifdef __cplusplus
inline
#endif
#endif
static unsigned int
cctld_hash (const char *str, unsigned int len)
{
  /* 256 + 25 + ... entries: large enough for any (unsigned char) + 25 index */
  static const unsigned short asso_values[] =
    {
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 119, 97, 33,
      103, 4, 59, 115, 210, 149, 169, 143, 175, 55,
      145, 89, 178, 37, 85, 18, 34, 239, 2, 73,
      112, 3, 25, 10, 15, 117, 209, 229, 150, 223,
      200, 78, 225, 54, 5, 215, 215, 190, 25, 23,
      0, 20, 233, 234, 14, 476, 33, 204, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476
    };
  const unsigned char first_ch = (unsigned char)str[0];
  const unsigned char second_ch = (unsigned char)str[1];

  return len + asso_values[second_ch] + asso_values[first_ch + 25];
}
#ifdef __GNUC__
__inline
#ifdef __GNUC_STDC_INLINE__
__attribute__ ((__gnu_inline__))
#endif
#endif
const char *
in_cctld_set (const char *str, unsigned int len)
{
enum
{
TOTAL_KEYWORDS = 252,
MIN_WORD_LENGTH = 2,
MAX_WORD_LENGTH = 2,
MIN_HASH_VALUE = 4,
MAX_HASH_VALUE = 475
};
static const unsigned char lengthtable[] =
{
0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0,
2, 2, 2, 0, 0, 2, 2, 0, 0, 0, 2, 2, 0, 2,
0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2,
2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2,
0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0,
2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2,
2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2,
2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2,
2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2,
0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2,
0, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2,
0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 0, 2,
2, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 0, 2,
2, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 0,
2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 0,
0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2,
2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2,
2, 2, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 0, 2,
2, 0, 2, 0, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0,
0, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2,
0, 0, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0,
0, 0, 0, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2,
2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0,
2, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0,
0, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2,
2, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2,
2, 0, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
};
static const char * const wordlist[] =
{
"", "", "", "",
"sv",
"sy",
"se",
"", "",
"mv",
"my",
"me",
"", "",
"bv",
"by",
"be",
"", "",
"cv",
"cy",
"", "", "",
"tv",
"ms",
"",
"sz",
"",
"re",
"bs",
"ae",
"mz",
"",
"ws",
"sc",
"st",
"bz",
"",
"ye",
"mc",
"mt",
"cz",
"rs",
"mq",
"as",
"bt",
"tz",
"", "",
"cc",
"",
"az",
"", "",
"tc",
"tt",
"sm",
"lv",
"ly",
"ac",
"at",
"mm",
"",
"aq",
"",
"mf",
"bm",
"",
"yt",
"",
"bf",
"cm",
"",
"ls",
"wf",
"cf",
"tm",
"", "",
"mw",
"tf",
"am",
"",
"je",
"bw",
"af",
"sr",
"",
"lc",
"lt",
"so",
"mr",
"", "",
"tw",
"mo",
"br",
"rw",
"sb",
"aw",
"bo",
"cr",
"", "",
"sd",
"co",
"tr",
"",
"bb",
"md",
"to",
"ar",
"",
"ro",
"bd",
"ao",
"sg",
"",
"mx",
"cd",
"sa",
"mg",
"de",
"",
"td",
"ma",
"bg",
"",
"cx",
"ad",
"ba",
"cg",
"", "",
"jm",
"ca",
"tg",
"",
"ax",
"",
"lr",
"ag",
"",
"dz",
"sk",
"qa",
"sn",
"", "",
"mk",
"si",
"mn",
"lb",
"",
"gy",
"ge",
"bn",
"", "",
"ck",
"bi",
"cn",
"", "",
"tk",
"ci",
"tn",
"",
"jo",
"gs",
"sj",
"an",
"",
"dm",
"la",
"ai",
"sl",
"", "", "",
"bj",
"ml",
"", "",
"mp",
"gt",
"bl",
"",
"gq",
"",
"tj",
"cl",
"", "",
"py",
"pe",
"tl",
"",
"lk",
"tp",
"",
"al",
"", "",
"li",
"ie",
"gm",
"do",
"",
"ps",
"gf",
"sh",
"", "",
"ee",
"",
"mh",
"", "",
"is",
"ne",
"bh",
"", "",
"gw",
"pt",
"ch",
"",
"es",
"ky",
"ke",
"th",
"", "", "",
"it",
"gr",
"uy",
"iq",
"ve",
"su",
"nz",
"",
"ec",
"et",
"mu",
"pm",
"",
"gb",
"nc",
"pf",
"kz",
"us",
"",
"gd",
"cu",
"im",
"jp",
"ht",
"uz",
"zm",
"dk",
"",
"ru",
"pw",
"au",
"gg",
"",
"vc",
"",
"ga",
"om",
"",
"yu",
"",
"nf",
"pr",
"",
"zw",
"hm",
"",
"km",
"", "", "",
"fm",
"ir",
"dj",
"",
"um",
"io",
"", "", "",
"lu",
"er",
"gn",
"", "",
"kw",
"gi",
"nr",
"", "",
"id",
"no",
"pg",
"", "",
"hr",
"pa",
"kr",
"", "", "",
"fr",
"", "", "",
"fo",
"", "", "", "",
"za",
"eg",
"gl",
"", "",
"gp",
"",
"ng",
"", "",
"pk",
"na",
"pn",
"", "", "", "",
"kg",
"", "", "", "",
"in",
"", "",
"ug",
"vg",
"", "",
"ua",
"va",
"", "", "", "", "", "",
"gh",
"", "", "",
"ni",
"pl",
"hk",
"",
"hn",
"",
"kn",
"",
"fk",
"",
"ki",
"il",
"uk",
"",
"fi",
"vn",
"", "", "",
"vi",
"", "", "", "", "",
"gu",
"nl",
"", "",
"np",
"", "", "", "",
"fj",
"",
"ph",
"", "",
"kp",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "",
"eh",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "",
"kh",
"", "", "", "", "", "", "", "", "",
"", "", "",
"eu",
"", "", "", "", "",
"nu",
"", "", "", "", "", "", "",
"hu",
"", "", "", "", "", "", "", "", "",
"",
"vu"
};
if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
{
int key = cctld_hash (str, len);
if (key <= MAX_HASH_VALUE && key >= 0)
if (len == lengthtable[key])
{
const char *s = wordlist[key];
if (*str == *s && !memcmp (str + 1, s + 1, len - 1))
return s;
}
}
return 0;
}

View File

@@ -1,28 +1,746 @@
/*
* Phishing module: iana tld list.
*
* Copyright (C) 2007-2008 Sourcefire, Inc.
*
* Authors: Török Edvin
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
/* ANSI-C code produced by gperf version 3.0.3 */
/* Command-line: gperf -C -l -L ANSI-C -E -C -H tld_hash -N in_tld_set */
/* Computed positions: -k'1-2,6' */
#ifndef IANA_TLD_H
#define IANA_TLD_H
#define iana_tld "(A[CDEFGILMNOQRSTUWXZ]|B[ABDEFGHIJMNORSTVWYZ]|C[ACDFGHIKLMNORUVXYZ]|D[EJKMOZ]|E[CEGRSTU]|F[IJKMOR]|G[ABDEFGHILMNPQRSTUWY]|H[KMNRTU]|I[DELMNOQRST]|J[EMOP]|K[EGHIMNPRWYZ]|L[ABCIKRSTUVY]|M[ACDEGHKLMNOPQRSTUVWXYZ]|N[ACEFGILOPRUZ]|OM|P[AEFGHKLMNRSTWY]|QA|R[EOSUW]|S[ABCDEGHIJKLMNORTUVYZ]|T[CDFGHJKLMNOPRTVWZ]|U[AGKMSYZ]|V[ACEGINU]|W[FS]|Y[ETU]|Z[AMW]|BIZ|CAT|COM|EDU|GOV|INT|MIL|NET|ORG|PRO|TEL|AERO|ARPA|ASIA|COOP|INFO|JOBS|MOBI|NAME|MUSEUM|TRAVEL|XN--ZCKZAH|XN--0ZWM56D|XN--DEBA0AD|XN--G6W251D|XN--JXALPDLP|XN--KGBECHTV|XN--9T4B11YI5A|XN--80AKHBYKNJ4F|XN--11B5BS3A9AJ6G|XN--HGBK6AJ7F53BBA)"
#define iana_cctld "(A[CDEFGILMNOQRSTUWXZ]|B[ABDEFGHIJLMNORSTVWYZ]|C[ACDFGHIKLMNORUVXYZ]|D[EJKMOZ]|E[CEGHRSTU]|F[IJKMOR]|G[ABDEFGHILMNPQRSTUWY]|H[KMNRTU]|I[DELMNOQRST]|J[EMOP]|K[EGHIMNPRWYZ]|L[ABCIKRSTUVY]|M[ACDEFGHKLMNOPQRSTUVWXYZ]|N[ACEFGILOPRUZ]|OM|P[AEFGHKLMNRSTWY]|QA|R[EOSUW]|S[ABCDEGHIJKLMNORTUVYZ]|T[CDFGHJKLMNOPRTVWZ]|U[AGKMSYZ]|V[ACEGINU]|W[FS]|Y[ETU]|Z[AMW]|BIZ|CAT|COM|EDU|GOV|IN[TT]|MIL|NET|ORG|PRO|TEL|AERO|ARP[AA]|ASIA|COOP|INFO|JOBS|MOBI|NAME|MUSEUM)"
#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
&& ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
&& (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
&& ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
&& ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
&& ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
&& ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \
&& ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \
&& ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
&& ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
&& ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
&& ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \
&& ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
&& ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
&& ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
&& ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \
&& ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
&& ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
&& ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
&& ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \
&& ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \
&& ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \
&& ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126))
/* The character set is not based on ISO-646. */
#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gnu-gperf@gnu.org>."
#endif
/* maximum key range = 983, duplicates = 0 */
/*
 * gperf-generated hash for the generic/IANA TLD keyword set.
 *
 * Starts from len and adds association values chosen by:
 *   - str[0] (offset by 25 into the table): always;
 *   - str[1]: for every length except 1;
 *   - str[5]: for every length outside 1..5.
 * NOTE(review): a length of 0 also takes the str[5] path, so callers must
 * filter lengths first; in_tld_set() only calls this with len >= 2.
 */
#ifdef __GNUC__
__inline
#else
#ifdef __cplusplus
inline
#endif
#endif
static unsigned int
tld_hash (const char *str, unsigned int len)
{
  static const unsigned short asso_values[] =
    {
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 0, 15,
      988, 988, 988, 988, 0, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 170, 328, 88,
      3, 50, 293, 205, 123, 430, 500, 238, 115, 320,
      375, 30, 413, 348, 70, 43, 475, 18, 6, 283,
      95, 58, 10, 220, 5, 485, 480, 8, 190, 390,
      225, 113, 420, 95, 0, 15, 50, 295, 20, 128,
      130, 80, 405, 470, 340, 0, 305, 415, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988
    };
  const int word_len = len;
  int hval = word_len;

  /* anything that is not 1..5 contributes its sixth character */
  if (word_len < 1 || word_len > 5)
    hval += asso_values[(unsigned char)str[5]];

  /* every length except exactly 1 contributes its second character */
  if (word_len != 1)
    hval += asso_values[(unsigned char)str[1]];

  /* the first character always contributes, via the shifted table region */
  hval += asso_values[(unsigned char)str[0]+25];

  return hval;
}
#ifdef __GNUC__
__inline
#ifdef __GNUC_STDC_INLINE__
__attribute__ ((__gnu_inline__))
#endif
#endif
const char *
in_tld_set (const char *str, unsigned int len)
{
enum
{
TOTAL_KEYWORDS = 280,
MIN_WORD_LENGTH = 2,
MAX_WORD_LENGTH = 18,
MIN_HASH_VALUE = 5,
MAX_HASH_VALUE = 987
};
static const unsigned char lengthtable[] =
{
0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2,
0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2,
0, 0, 2, 0, 2, 0, 4, 2, 0, 2, 3, 4, 2, 0,
2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2,
0, 4, 0, 0, 2, 0, 2, 0, 4, 2, 0, 2, 3, 0,
0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0,
4, 2, 0, 2, 2, 0, 2, 0, 2, 0, 0, 2, 0, 2,
0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0, 0, 0, 0,
2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 3, 0, 2,
0, 2, 0, 0, 2, 0, 2, 3, 0, 2, 0, 0, 2, 0,
2, 0, 2, 0, 0, 2, 0, 4, 2, 0, 2, 0, 2, 0,
0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2,
0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 3, 0, 2, 0,
2, 0, 0, 2, 0, 2, 0, 4, 2, 0, 2, 0, 0, 2,
0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 0,
2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 3,
0, 2, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 0, 0,
0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0,
2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2,
0, 2, 0, 0, 2, 6, 2, 0, 0, 0, 0, 2, 0, 0,
2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0,
0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2,
0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0,
2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2,
0, 2, 0, 0, 2, 0, 2, 0, 6, 2, 0, 2, 0, 0,
2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0,
0, 2, 0, 2, 3, 0, 2, 0, 2, 0, 0, 2, 0, 2,
0, 0, 0, 0, 2, 0, 0, 2, 11, 2, 0, 0, 0, 16,
2, 0, 0, 0, 11, 2, 0, 0, 0, 0, 2, 0, 0, 0,
0, 17, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0, 0,
2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 3,
0, 2, 11, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2,
0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0,
2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 10, 0, 2,
0, 2, 0, 0, 2, 0, 12, 0, 0, 2, 3, 2, 0, 0,
2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0,
0, 2, 0, 2, 18, 0, 2, 0, 2, 0, 0, 2, 0, 2,
0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 0, 0,
2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2,
0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0,
2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0,
0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2,
0, 0, 2, 0, 12, 0, 0, 0, 0, 2, 18, 0, 0, 0,
2, 3, 4, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0,
2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0,
0, 2, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0,
0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 2
};
static const char * const wordlist[] =
{
"", "", "", "", "",
"md",
"", "",
"mv",
"",
"cd",
"",
"mz",
"cv",
"",
"ad",
"",
"cz",
"", "",
"mu",
"",
"az",
"", "",
"cu",
"",
"nz",
"", "",
"au",
"",
"mo",
"",
"mobi",
"nu",
"",
"co",
"com",
"coop",
"fo",
"",
"ao",
"", "",
"ms",
"",
"no",
"", "", "", "",
"me",
"", "",
"as",
"",
"asia",
"", "",
"my",
"",
"ae",
"",
"aero",
"cy",
"",
"ne",
"net",
"", "", "",
"mr",
"", "", "", "",
"cr",
"", "",
"fr",
"",
"ar",
"",
"arpa",
"td",
"",
"nr",
"tv",
"",
"mc",
"",
"tz",
"", "",
"cc",
"",
"mx",
"", "",
"ac",
"",
"cx",
"lv",
"",
"nc",
"",
"ax",
"", "", "", "",
"to",
"", "",
"lu",
"",
"ml",
"", "", "", "",
"cl",
"org",
"",
"mh",
"",
"al",
"", "",
"ch",
"",
"nl",
"tel",
"",
"sd",
"", "",
"sv",
"",
"ls",
"",
"sz",
"", "",
"jo",
"",
"jobs",
"ru",
"",
"su",
"",
"tr",
"", "",
"ly",
"", "", "", "",
"ro",
"",
"so",
"", "",
"je",
"",
"lr",
"", "",
"tc",
"",
"ma",
"rs",
"", "", "",
"ca",
"cat",
"",
"re",
"",
"se",
"", "",
"lc",
"",
"na",
"",
"name",
"sy",
"",
"qa",
"", "",
"gd",
"",
"tl",
"", "", "", "",
"sr",
"", "",
"th",
"",
"mg",
"", "",
"gu",
"",
"cg",
"", "", "", "",
"ag",
"", "",
"sc",
"",
"ng",
"gov",
"",
"bd",
"", "",
"bv",
"",
"id",
"",
"bz",
"", "",
"gs",
"", "", "", "",
"mk",
"",
"ge",
"", "",
"ck",
"",
"sl",
"fk",
"",
"gy",
"",
"bo",
"", "",
"sh",
"",
"io",
"", "", "", "",
"gr",
"", "",
"bs",
"",
"la",
"", "",
"is",
"travel",
"be",
"", "", "", "",
"ie",
"", "",
"by",
"", "", "", "",
"mw",
"",
"tg",
"", "", "", "",
"br",
"", "",
"aw",
"",
"ir",
"", "",
"cf",
"",
"sa",
"", "",
"af",
"",
"gl",
"", "",
"nf",
"", "", "", "",
"gh",
"", "", "", "",
"tk",
"",
"mm",
"", "",
"yu",
"",
"cm",
"", "",
"fm",
"",
"am",
"", "",
"lk",
"",
"sg",
"", "",
"ps",
"",
"il",
"",
"museum",
"bh",
"",
"pe",
"", "",
"mq",
"", "", "", "",
"py",
"",
"ye",
"", "",
"aq",
"",
"ga",
"", "",
"tw",
"",
"pr",
"pro",
"",
"sk",
"",
"om",
"", "",
"tf",
"",
"mn",
"", "", "", "",
"cn",
"", "",
"ws",
"xn--g6w251d",
"an",
"", "", "",
"xn--80akhbyknj4f",
"ba",
"", "", "",
"xn--0zwm56d",
"gg",
"", "", "", "",
"tm",
"", "", "", "",
"xn--11b5bs3a9aj6g",
"", "",
"hu",
"",
"pl",
"rw",
"",
"mp",
"",
"uz",
"", "",
"ph",
"", "", "", "",
"lb",
"",
"bg",
"", "",
"np",
"",
"kz",
"mil",
"",
"jm",
"xn--deba0ad",
"ci",
"", "",
"fi",
"",
"ai",
"", "", "", "",
"ni",
"", "",
"us",
"",
"sm",
"", "", "", "",
"tn",
"", "",
"sb",
"",
"hr",
"", "",
"uy",
"",
"pa",
"", "", "", "",
"ke",
"xn--zckzah",
"",
"gw",
"",
"mt",
"", "",
"ky",
"",
"xn--jxalpdlp",
"", "",
"gf",
"edu",
"at",
"", "",
"vu",
"",
"kr",
"", "",
"tp",
"",
"dz",
"", "",
"eu",
"",
"pg",
"", "",
"bw",
"",
"sn",
"xn--hlcj6aya9esc7a",
"",
"fj",
"",
"gm",
"", "",
"bf",
"",
"do",
"", "",
"gb",
"",
"ve",
"", "",
"es",
"",
"li",
"jp",
"", "", "",
"ee",
"", "",
"pk",
"",
"de",
"", "",
"gq",
"",
"bm",
"", "",
"kh",
"",
"im",
"", "",
"bb",
"",
"er",
"", "", "", "",
"tt",
"", "",
"vc",
"",
"si",
"", "", "", "",
"gn",
"", "",
"ec",
"",
"lt",
"", "",
"iq",
"",
"ua",
"", "",
"pw",
"",
"tj",
"", "", "", "",
"za",
"", "",
"pf",
"",
"xn--kgbechtv",
"", "", "", "",
"bn",
"xn--hgbk6aj7f53bba",
"", "", "",
"in",
"int",
"info",
"gp",
"",
"st",
"", "", "", "",
"ug",
"", "", "", "",
"pm",
"", "", "", "",
"gi",
"", "", "", "",
"kg",
"", "",
"hk",
"",
"sj",
"", "",
"wf",
"", "", "", "", "", "",
"va",
"", "",
"uk",
"", "", "", "", "", "",
"bi",
"biz",
"", "", "", "", "", "", "", "", "",
"", "", "", "",
"gt",
"", "", "", "",
"pn",
"", "", "", "",
"vg",
"", "", "", "", "", "", "", "", "",
"eg",
"", "", "", "", "", "", "", "", "",
"bt",
"", "",
"zw",
"",
"it",
"", "",
"kw",
"", "", "", "", "", "",
"hm",
"", "", "", "", "", "", "", "", "",
"bj",
"", "",
"dk",
"", "", "", "", "", "", "", "", "",
"", "",
"zm",
"", "", "", "",
"km",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "",
"hn",
"", "", "", "",
"pt",
"", "", "", "", "", "", "", "", "",
"yt",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "",
"kn",
"", "", "", "", "", "", "", "", "",
"dm",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "",
"kp",
"", "", "", "", "", "", "", "", "",
"", "",
"vn",
"", "", "", "",
"ki",
"", "", "", "", "", "", "", "", "",
"", "",
"xn--9t4b11yi5a",
"", "",
"ht",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "",
"vi",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "",
"et",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "",
"", "",
"dj"
};
if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
{
int key = tld_hash (str, len);
if (key <= MAX_HASH_VALUE && key >= 0)
if (len == lengthtable[key])
{
const char *s = wordlist[key];
if (*str == *s && !memcmp (str + 1, s + 1, len - 1))
return s;
}
}
return 0;
}

View File

@@ -49,16 +49,6 @@ int domainlist_match(const struct cl_engine* engine,char* real_url,const char* d
{
const char* info;
int rc = engine->domainlist_matcher ? regex_list_match(engine->domainlist_matcher,real_url,display_url,hostOnly ? pre_fixup : NULL,hostOnly,&info,0) : 0;
if(rc && info && info[0] && info[0] != ':') {/*match successful, and has custom flags*/
if(strlen(info)==3 && isxdigit(info[0]) && isxdigit(info[1]) && isxdigit(info[2])) {
unsigned short notwantedflags=0;
sscanf(info,"%hx",&notwantedflags);
*flags &= ~notwantedflags;/* filter unwanted phishcheck flags */
}
else {
cli_warnmsg("Phishcheck:Unknown flag format in domain-list, 3 hex digits expected");
}
}
return rc;
}
@@ -79,13 +69,6 @@ int is_domainlist_ok(const struct cl_engine* engine)
return (engine && engine->domainlist_matcher) ? is_regex_ok(engine->domainlist_matcher) : 1;
}
/*
 * Run regex_list_cleanup() on the engine's domain-list matcher.
 * Does nothing when engine is NULL or no domain-list matcher is attached.
 */
void domainlist_cleanup(const struct cl_engine* engine)
{
	if(!engine || !engine->domainlist_matcher)
		return;

	regex_list_cleanup(engine->domainlist_matcher);
}
void domainlist_done(struct cl_engine* engine)
{
if(engine && engine->domainlist_matcher) {

View File

@@ -69,13 +69,6 @@ int is_whitelist_ok(const struct cl_engine* engine)
return (engine && engine->whitelist_matcher) ? is_regex_ok(engine->whitelist_matcher) : 1;
}
/*
 * Run regex_list_cleanup() on the engine's whitelist matcher.
 * Does nothing when engine is NULL or no whitelist matcher is attached.
 */
void whitelist_cleanup(const struct cl_engine* engine)
{
	if(!engine || !engine->whitelist_matcher)
		return;

	regex_list_cleanup(engine->whitelist_matcher);
}
void whitelist_done(struct cl_engine* engine)
{
if(engine && engine->whitelist_matcher) {

View File

@@ -39,6 +39,7 @@
#include <ctype.h>
#include "clamav.h"
#include "cltypes.h"
#include "others.h"
#include "mbox.h"
#include "message.h"
@@ -47,6 +48,7 @@
#include "phish_domaincheck_db.h"
#include "phish_whitelist.h"
#include "iana_tld.h"
#include "iana_cctld.h"
#define DOMAIN_REAL 1
@@ -140,8 +142,6 @@ static char empty_string[]="";
#define CLOAKED_URL "^"ANY_CLOAK"(\\."ANY_CLOAK"){0,3}$"
static const char cloaked_host_regex[] = CLOAKED_URL;
static const char tld_regex[] = "^"iana_tld"$";
static const char cctld_regex[] = "^"iana_cctld"$";
static const char dotnet[] = ".net";
static const char adonet[] = "ado.net";
static const char aspnet[] = "asp.net";
@@ -151,7 +151,10 @@ static const char gt[]="&gt";
static const char src_text[] = "src";
static const char href_text[] = "href";
static const char mailto[] = "mailto:";
static const char mailto_proto[] = "mailto://";
static const char https[]="https://";
static const char http[]="http://";
static const char ftp[] = "ftp://";
static const size_t href_text_len = sizeof(href_text);
static const size_t src_text_len = sizeof(src_text);
@@ -161,7 +164,10 @@ static const size_t aspnet_len = sizeof(aspnet)-1;
static const size_t lt_len = sizeof(lt)-1;
static const size_t gt_len = sizeof(gt)-1;
static const size_t mailto_len = sizeof(mailto)-1;
static const size_t mailto_proto_len = sizeof(mailto_proto)-1;
static const size_t https_len = sizeof(https)-1;
static const size_t http_len = sizeof(http)-1;
static const size_t ftp_len = sizeof(ftp)-1;
/* for urls, including mailto: urls, and (broken) http:www... style urls*/
/* refer to: http://www.w3.org/Addressing/URL/5_URI_BNF.html
@@ -169,41 +175,13 @@ static const size_t https_len = sizeof(https)-1;
* So the 'safe' char class has been split up
* */
/* character classes */
#define URI_alpha "a-zA-Z"
#define URI_digit "0-9"
#define URI_safe_nodot "-$_@&"
#define URI_safe "-$_@.&"
#define URI_extra "!*\"'(),"
#define URI_hex "[0-9a-fA-f]"
#define URI_escape "%"URI_hex"{2}"
#define URI_xalpha "([" URI_safe URI_alpha URI_digit URI_extra "]|"URI_escape")" /* URI_safe has to be first, because it contains - */
#define URI_xalpha_nodot "([" URI_safe_nodot URI_alpha URI_digit URI_extra "]|"URI_escape")"
#define URI_xalphas_nodot URI_xalpha_nodot"*"
#define URI_ialpha "["URI_alpha"]"URI_xalphas_nodot""
#define URI_xpalpha URI_xalpha"|\\+"
#define URI_xpalpha_nodot URI_xalpha_nodot"|\\+"
#define URI_xpalphas_nodot "("URI_xpalpha_nodot")+"
#define URI_scheme URI_ialpha
#define URI_tld iana_tld
#define URI_path1 URI_xpalphas_nodot"\\.("URI_xpalphas_nodot"\\.)*"
#define URI_IP_digits "["URI_digit"]{1,3}"
#define URI_path_start "[/?:]?"
#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}"URI_path_start
#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path
#define URI_numeric_URI "(http|https|ftp:(//)?)?"URI_numeric_path
#define URI_numeric_fragmentaddress URI_numeric_URI
#define URI_URI1 "("URI_scheme":(//)?)?"URI_path1
#define URI_URI2 URI_tld
#define URI_fragmentaddress1 URI_URI1
#define URI_fragmentaddress2 URI_URI2""URI_path_start
#define URI_CHECK_PROTOCOLS "(http|https|ftp|mailto)://.+"
/*Warning: take care when modifying this regex, it has been tweaked, and tuned, just don't break it please.
* there is fragmentaddress1, and 2 to work around the ISO limitation of 509 bytes max length for string constants*/
@@ -235,7 +213,6 @@ static int string_assign_concatenated(struct string* dest, const char* prefix, c
static void string_assign_null(struct string* dest);
static char *rfind(char *start, char c, size_t len);
static char hex2int(const unsigned char* src);
static int isTLD(const struct phishcheck* pchk,const char* str,int len);
static enum phish_status phishingCheck(const struct cl_engine* engine,struct url_check* urls);
static const char* phishing_ret_toString(enum phish_status rc);
@@ -416,7 +393,7 @@ static int get_host(const struct phishcheck* s,const char* URL,int isReal,int* p
}
tld = strrchr(realhost,'.');
rc = tld ? isTLD(s,tld,tld-realhost-1) : 0;
rc = tld ? !!in_tld_set(tld,tld-realhost-1) : 0;
if(rc < 0)
return rc;
if(rc)
@@ -438,28 +415,6 @@ static int get_host(const struct phishcheck* s,const char* URL,int isReal,int* p
return 0;
}
/*
 * Returns 1 when str is non-NULL and cli_regexec() reports a match (returns 0)
 * against s->preg_cctld, 0 otherwise.
 */
static int isCountryCode(const struct phishcheck* s,const char* str)
{
	if(str == NULL)
		return 0;
	return cli_regexec(&s->preg_cctld, str, 0, NULL, 0) == 0;
}
/*
 * Tests the first len bytes of str against pchk->preg_tld.
 * The prefix is copied into a NUL-terminated scratch buffer first, because the
 * regex is run on a C string.
 * Returns 1 on a match, 0 on no match or when str is NULL, and CL_EMEM when the
 * scratch buffer cannot be allocated.
 */
static int isTLD(const struct phishcheck* pchk,const char* str,int len)
{
	char *copy;
	int matched;

	if(str == NULL)
		return 0;

	copy = cli_malloc(len+1);
	if(copy == NULL)
		return CL_EMEM;

	/* strncpy does not terminate when str is at least len bytes long */
	strncpy(copy, str, len);
	copy[len] = '\0';

	matched = (cli_regexec(&pchk->preg_tld, copy, 0, NULL, 0) == 0);
	free(copy);
	return matched;
}
/*
* memrchr isn't standard, so I use this
@@ -486,7 +441,7 @@ static void get_domain(const struct phishcheck* pchk,struct string* dest,struct
string_assign(dest,host);
return;
}
if(isCountryCode(pchk,tld+1)) {
if(in_cctld_set(tld+1, strlen(tld+1))) {
const char* countrycode = tld+1;
tld = rfind(host->data,'.',tld-host->data-1);
if(!tld) {
@@ -495,7 +450,7 @@ static void get_domain(const struct phishcheck* pchk,struct string* dest,struct
string_assign(dest,host);
return;
}
if(!isTLD(pchk,tld+1,countrycode-tld-2)) {
if(!in_tld_set(tld+1, countrycode-tld-2)) {
string_assign_ref(dest,host,tld+1);
return;/*it was a name like: subdomain.domain.uk, return domain.uk*/
}
@@ -737,11 +692,7 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
/* @end points to last character we want to be part of the URL */
end = host_begin + host_len - 1;
}
/* terminate URL with a slash, except when we're at end of string */
if(host_begin[host_len]) {
host_begin[host_len] = '/';
end++;
}
host_begin[host_len] = '\0';
/* convert hostname to lowercase, but only hostname! */
str_make_lowercase(host_begin, host_len);
/* some broken MUAs put > in the href, and then
@@ -797,6 +748,40 @@ int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs)
if(!ctx->found_possibly_unwanted)
*ctx->virname=NULL;
#if 0
FILE *f = fopen("/home/edwin/quarantine/urls","r");
if(!f)
abort();
while(!feof(f)) {
struct url_check urls;
char line1[4096];
char line2[4096];
char line3[4096];
fgets(line1, sizeof(line1), f);
fgets(line2, sizeof(line2), f);
fgets(line3, sizeof(line3), f);
if(strcmp(line3, "\n") != 0) {
strcpy(line1, line2);
strcpy(line2, line3);
fgets(line3, sizeof(line3), f);
while(strcmp(line3, "\n") != 0) {
fgets(line3, sizeof(line3),f);
}
}
urls.flags = CL_PHISH_ALL_CHECKS;
urls.link_type = 0;
string_init_c(&urls.realLink, line1);
string_init_c(&urls.displayLink, line2);
string_init_c(&urls.pre_fixup.pre_displayLink, NULL);
urls.realLink.refcount=-1;
urls.displayLink.refcount=-1;
int rc = phishingCheck(ctx->engine, &urls);
//printf("%d\n",rc);
}
fclose(f);
return 0;
#endif
for(i=0;i<hrefs->count;i++)
if(hrefs->contents[i]) {
struct url_check urls;
@@ -928,44 +913,7 @@ int phishing_init(struct cl_engine* engine)
return CL_EFORMAT;
}
if(build_regex(&pchk->preg_cctld,cctld_regex,1)) {
free(pchk);
engine->phishcheck = NULL;
return CL_EFORMAT;
}
if(build_regex(&pchk->preg_tld,tld_regex,1)) {
free_regex(&pchk->preg_cctld);
free(pchk);
engine->phishcheck = NULL;
return CL_EFORMAT;
}
url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_fragmentaddress1,URI_fragmentaddress2")) *$");
if(!url_regex || build_regex(&pchk->preg,url_regex,1)) {
free_regex(&pchk->preg_cctld);
free_regex(&pchk->preg_tld);
free(url_regex);
free(pchk);
engine->phishcheck = NULL;
return CL_EFORMAT;
}
free(url_regex);
realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_path1,URI_fragmentaddress2")) *$");
if(!realurl_regex || build_regex(&pchk->preg_realurl, realurl_regex,1)) {
free_regex(&pchk->preg_cctld);
free_regex(&pchk->preg_tld);
free_regex(&pchk->preg);
free(url_regex);
free(realurl_regex);
free(pchk);
engine->phishcheck = NULL;
return CL_EFORMAT;
}
free(realurl_regex);
if(build_regex(&pchk->preg_numeric,numeric_url_regex,1)) {
free_regex(&pchk->preg_cctld);
free_regex(&pchk->preg_tld);
free_regex(&pchk->preg);
free_regex(&pchk->preg_realurl);
free(pchk);
engine->phishcheck = NULL;
return CL_EFORMAT;
@@ -980,12 +928,8 @@ void phishing_done(struct cl_engine* engine)
struct phishcheck* pchk = engine->phishcheck;
cli_dbgmsg("Cleaning up phishcheck\n");
if(pchk && !pchk->is_disabled) {
free_regex(&pchk->preg);
free_regex(&pchk->preg_hexurl);
free_regex(&pchk->preg_cctld);
free_regex(&pchk->preg_tld);
free_regex(&pchk->preg_numeric);
free_regex(&pchk->preg_realurl);
pchk->is_disabled = 1;
}
whitelist_done(engine);
@@ -998,22 +942,165 @@ void phishing_done(struct cl_engine* engine)
cli_dbgmsg("Phishcheck cleaned up\n");
}
/*
 * Character-class lookup table indexed by byte value (0-255):
 * 1 for the ASCII letters, 0 for every other byte.
 * Accepted characters: ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
 * Each row below covers 16 consecutive byte values, starting at 0x00.
 */
static const uint8_t URI_alpha[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/*
 * Character-class lookup table indexed by byte value (0-255):
 * 1 for letters, digits and the punctuation listed below, 0 for every other byte.
 * Neither '.' nor '+' is accepted.
 * Accepted characters:
 * !"$%&'()*,-0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz
 * Each row below covers 16 consecutive byte values, starting at 0x00.
 */
static const uint8_t URI_xalpha_nodot[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/*
 * Character-class lookup table indexed by byte value (0-255):
 * 1 for letters, digits and the punctuation listed below, 0 for every other byte.
 * '+' is accepted here; '.' is not.
 * Accepted characters:
 * !"$%&'()*+,-0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz
 * Each row below covers 16 consecutive byte values, starting at 0x00.
 */
static const uint8_t URI_xpalpha_nodot[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/*
 * Checks that every byte in [start, end) is flagged in URI_xalpha_nodot.
 * An empty range is accepted.
 * Returns 1 when all bytes are valid, 0 at the first invalid byte.
 */
static inline int validate_uri_xalphas_nodot(const char *start, const char *end)
{
	const unsigned char *p;

	/* walk the range as unsigned bytes so the table index is never negative;
	 * char* and unsigned char* are distinct types, hence the explicit casts */
	for(p = (const unsigned char*)start; p < (const unsigned char*)end; p++) {
		if(!URI_xalpha_nodot[*p])
			return 0;
	}
	return 1;
}
/*
 * Checks that [start, end) is non-empty and that every byte in it is flagged
 * in URI_xpalpha_nodot.
 * Returns 1 when the range is valid, 0 when it is empty or contains an
 * invalid byte.
 */
static inline int validate_uri_xpalphas_nodot(const char *start, const char *end)
{
	const unsigned char *p;

	/* walk the range as unsigned bytes so the table index is never negative;
	 * char* and unsigned char* are distinct types, hence the explicit casts */
	for(p = (const unsigned char*)start; p < (const unsigned char*)end; p++) {
		if(!URI_xpalpha_nodot[*p])
			return 0;
	}
	/* must have at least one char */
	return p > (const unsigned char*)start;
}
/*
 * Checks that [start, end) is non-empty, that its first byte is flagged in
 * URI_alpha, and that the remaining bytes pass validate_uri_xalphas_nodot().
 * Returns 1 when valid, 0 otherwise.
 */
static inline int validate_uri_ialpha(const char *start, const char *end)
{
	/* cast to unsigned char so a byte >= 0x80 cannot become a negative index */
	if(start >= end || !URI_alpha[(unsigned char)*start])
		return 0;
	return validate_uri_xalphas_nodot(start + 1, end);
}
/*
 * Only those URLs are identified as URLs for which phishing detection can be performed.
 *
 * Summary of the hand-written matcher below (the statements after the declarations):
 *  - a NULL URL is rejected;
 *  - a URL starting with the https, http, ftp or mailto_proto prefix is accepted as
 *    long as at least one character follows the prefix;
 *  - otherwise, when accept_anyproto is non-zero and the text before the first ':' is
 *    a valid scheme (validate_uri_ialpha), that scheme and the ':' or "://" after it
 *    are skipped;
 *  - the rest must contain at least one '.', every dot-separated component before the
 *    last '.' must pass validate_uri_xpalphas_nodot, and the text after the last '.'
 *    is looked up with in_tld_set().
 * Returns 1 when the string is accepted, 0 otherwise.
 */
static int isURL(const struct phishcheck* pchk,const char* URL)
static int isURL(const struct phishcheck* pchk,const char* URL, int accept_anyproto)
{
/* NOTE(review): the next statement and the two-argument signature line above look like
 * the pre-change regex implementation shown alongside the new code -- confirm which
 * lines are actually live. */
return URL ? !cli_regexec(&pchk->preg,URL,0,NULL,0) : 0;
const char *start = NULL, *p, *q;
if(!URL)
return 0;
/* fast path: dispatch on the first character, then compare against the known prefixes */
switch (URL[0]) {
case 'h':
/* https is tested before http */
if (strncmp(URL, https, https_len) == 0)
start = URL + https_len;
else if (strncmp(URL, http, http_len) == 0)
start = URL + http_len;
break;
case 'f':
if (strncmp(URL, ftp, ftp_len) == 0)
start = URL + ftp_len;
break;
case 'm':
if (strncmp(URL, mailto_proto, mailto_proto_len) == 0)
start = URL + mailto_proto_len;
break;
}
if(start) {
if(start[0] == '\0')
return 0;/* empty URL */
/* has a valid protocol, it is a URL */
return 1;
}
/* no known prefix: optionally accept any syntactically valid scheme before the first ':' */
start = accept_anyproto ? strchr(URL, ':') : NULL;
if(start) {
/* validate URI scheme */
if(validate_uri_ialpha(URL, start)) {
if(start[1] == '/' && start[2] == '/')
start += 3; /* skip :// */
else
start++;
}
else
start = URL; /* scheme invalid */
} else
start = URL;
/* validate each dot-terminated component; on exit p points just past the last '.' */
p = start;
do {
q = strchr(p, '.');
if(q) {
if(!validate_uri_xpalphas_nodot(p, q))
return 0;
p = q+1;
}
} while(q);
if (p == start) /* must have at least one dot in the URL */
return 0;
/* whatever follows the last dot must be in the TLD set; !! normalizes the result to 0/1 */
return !!in_tld_set(p, strlen(p));
}
/*
* Check if this is a real URL, which basically means to check if it has a known URL scheme (http,https,ftp).
* This prevents false positives with outbind:// and blocked:: links.
*/
#if 0
/*
 * Returns 1 when URL is non-NULL and cli_regexec() reports a match (returns 0)
 * against pchk->preg_realurl, 0 otherwise.
 */
static int isRealURL(const struct phishcheck* pchk,const char* URL)
{
	if(URL == NULL)
		return 0;
	return cli_regexec(&pchk->preg_realurl, URL, 0, NULL, 0) == 0;
}
#endif
static int isNumericURL(const struct phishcheck* pchk,const char* URL)
{
@@ -1139,7 +1226,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
cli_dbgmsg("Phishcheck:URL after cleanup: %s->%s\n", urls->realLink.data,
urls->displayLink.data);
if((!isURL(pchk, urls->displayLink.data) || !isRealURL(pchk, urls->realLink.data) ) &&
if((!isURL(pchk, urls->displayLink.data, 1) || !isURL(pchk, urls->realLink.data, 0) ) &&
( (phishy&PHISHY_NUMERIC_IP && !isNumericURL(pchk, urls->displayLink.data)) ||
!(phishy&PHISHY_NUMERIC_IP))) {
cli_dbgmsg("Displayed 'url' is not url:%s\n",urls->displayLink.data);

View File

@@ -44,10 +44,6 @@ struct string {
};
struct phishcheck {
regex_t preg;
regex_t preg_realurl;
regex_t preg_tld;
regex_t preg_cctld;
regex_t preg_numeric;
regex_t preg_hexurl;
int is_disabled;

View File

@@ -1839,6 +1839,12 @@ int cl_build(struct cl_engine *engine)
}
}
if((ret = cli_build_regex_list(engine->whitelist_matcher))) {
return ret;
}
if((ret = cli_build_regex_list(engine->domainlist_matcher))) {
return ret;
}
cli_md5db_build(engine->md5_mdb);
cli_freeign(engine);
cli_dconf_print(engine->dconf);

View File

File diff suppressed because it is too large Load Diff

View File

@@ -24,39 +24,37 @@
#ifndef _REGEX_LIST_H
#define _REGEX_LIST_H
#ifdef NDEBUG
#define massert(x) (void)(0)
#else
/*debug version, massert enabled*/
#define __massert_fail(expr,file,line) (void)cli_errmsg("Assertion failed at %s:%d\n %s\n",file,line,expr)
#define massert(expr) ((void) ((expr) ? (void)0 : (__massert_fail (#expr,__FILE__,__LINE__))))
#endif
#include "phishcheck.h"
#include "readdb.h"
#include "matcher.h"
#include <zlib.h> /* for gzFile */
struct node_stack {
struct tree_node** data;
size_t capacity;
size_t cnt;
struct regex_list {
const char *pattern;
regex_t preg;
struct regex_list *nxt;
};
/*
 * Pre-filter state.
 * NOTE(review): presumably this is the shift-or filter used to reject URLs before the
 * more expensive matching -- confirm against regex_list.c.
 * The two 65536-entry tables hold 32-bit words, so one instance is a little over
 * 512 KiB.
 */
struct filter {
/* 32-bit table with 65536 entries; presumably indexed by a 16-bit value (two bytes) -- TODO confirm */
uint32_t B[65536];
/* 32-bit table with 256 entries; presumably indexed by a single byte -- TODO confirm */
uint32_t end_fast[256];
/* 32-bit table with 65536 entries; presumably marks pattern ends -- TODO confirm */
uint32_t end[65536];
/* meaning not visible here; presumably a pattern-length or state bound -- TODO confirm */
unsigned long m;
};
struct regex_matcher {
struct cli_matcher* root_hosts;
struct tree_node* root_regex;
struct tree_node* root_regex_hostonly;
struct node_stack node_stack;
struct node_stack node_stack_alt;
size_t root_hosts_cnt;
int list_inited;
int list_loaded;
int list_built;
struct hashtable suffix_hash;
size_t suffix_cnt;
struct regex_list **suffix_regexes;
struct cli_matcher suffixes;
struct filter filter;
int list_inited:2;
int list_loaded:2;
int list_built:2;
};
int regex_list_match(struct regex_matcher* matcher, char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup, int hostOnly,const char** info,int is_whitelist);
int cli_build_regex_list(struct regex_matcher* matcher);
int regex_list_match(struct regex_matcher* matcher, char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup, int hostOnly,const char **info, int is_whitelist);
int init_regex_list(struct regex_matcher* matcher);
int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options,int is_whitelist,struct cli_dbio *dbio);
void regex_list_cleanup(struct regex_matcher* matcher);