diff --git a/libclamav/hwp.c b/libclamav/hwp.c
index d855aad2e..ecca124fd 100644
--- a/libclamav/hwp.c
+++ b/libclamav/hwp.c
@@ -23,7 +23,12 @@
 #include "clamav-config.h"
 #endif
 
-#if HAVE_ICONV
-#include <iconv.h>
+#if HAVE_LIBXML2
+#ifdef _WIN32
+#ifndef LIBXML_WRITER_ENABLED
+#define LIBXML_WRITER_ENABLED 1
+#endif
+#endif
+#include <libxml/xmlreader.h>
 #endif
 
@@ -39,6 +44,8 @@
 #include "str.h"
 #include "others.h"
 #include "scanners.h"
+#include "msxml_parser.h"
+#include "msxml.h"
 #include "json_api.h"
 #include "hwp.h"
 #if HAVE_JSON
@@ -47,6 +54,7 @@
 
 #define HWP5_DEBUG 0
 #define HWP3_DEBUG 1
+#define HWPML_DEBUG 1
 #if HWP5_DEBUG
 #define hwp5_debug(...) cli_dbgmsg(__VA_ARGS__)
 #else
@@ -57,6 +65,11 @@
 #else
 #define hwp3_debug(...) ;
 #endif
+#if HWPML_DEBUG
+#define hwpml_debug(...) cli_dbgmsg(__VA_ARGS__)
+#else
+#define hwpml_debug(...) ;
+#endif
 
 typedef int (*hwp_cb )(void *cbdata, int fd, cli_ctx *ctx);
 static int decompress_and_callback(cli_ctx *ctx, fmap_t *input, off_t at, size_t len, const char *parent, hwp_cb cb, void *cbdata)
@@ -488,8 +501,8 @@ static inline int parsehwp3_docsummary(cli_ctx *ctx, off_t offset)
         return ret;
     }
 #else
-    UNUSED(ctx);
-    UNUSED(offset);
+    UNUSEDPARAM(ctx);
+    UNUSEDPARAM(offset);
 #endif
     return CL_SUCCESS;
 }
@@ -637,3 +650,92 @@ int cli_scanhwp3(cli_ctx *ctx)
 
     return ret;
 }
+
+/*** HWPML (hijacking the msxml parser) ***/
+
+static const struct key_entry hwpml_keys[] = {
+    { "hwpml", "HWPML", MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },
+
+    /* HEAD - Document Properties */
+    { "head", "Head", MSXML_JSON_ROOT },
+    { "docsummary", "DocumentProperties", MSXML_JSON_WRKPTR },
+    { "title", "Title", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
+    { "author", "Author", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
+    { "date", "Date", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
+    { "docsetting", "DocumentSettings", MSXML_JSON_WRKPTR },
+    { "beginnumber", "BeginNumber", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
+    { "caretpos", "CaretPos", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
+    { "bindatalist", "BinDataList", MSXML_JSON_WRKPTR },
+    { "binitem", "BinItem", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
+    { "facenamelist", "FaceNameList", MSXML_IGNORE_ELEM }, /* fonts list */
+    { "borderfilllist", "BorderFillList", MSXML_IGNORE_ELEM }, /* borders list */
+    { "charshapelist", "CharShapeList", MSXML_IGNORE_ELEM }, /* character shapes */
+    { "tabdeflist", "TableDefList", MSXML_IGNORE_ELEM }, /* table defs */
+    { "numberinglist", "NumberingList", MSXML_IGNORE_ELEM }, /* numbering list */
+    { "parashapelist", "ParagraphShapeList", MSXML_IGNORE_ELEM }, /* paragraph shapes */
+    { "stylelist", "StyleList", MSXML_IGNORE_ELEM }, /* styles */
+    { "compatibledocument", "WordCompatibility", MSXML_IGNORE_ELEM }, /* word compatibility data */
+
+    /* BODY - Document Contents */
+    { "body", "Body", MSXML_IGNORE_ELEM }, /* document contents (we could build a document contents summary) */
+
+    /* TAIL - Document Attachments */
+    { "tail", "Tail", MSXML_JSON_ROOT },
+    { "bindatastorage", "BinaryDataStorage", MSXML_JSON_WRKPTR },
+    { "bindata", "BinaryData", MSXML_SCAN_B64 | MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
+    { "scriptcode", "ScriptCodeStorage", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
+    { "scriptheader", "ScriptHeader", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
+    { "scriptsource", "ScriptSource", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE }
+};
+static size_t num_hwpml_keys = sizeof(hwpml_keys) / sizeof(struct key_entry);
+
+/**
+ * Scan an HWPML (single XML document) file taken from ctx's current fmap.
+ *
+ * Points an msxml_cbdata at the map, opens a libxml2 text reader that pulls
+ * its input through msxml_read_cb(), and hands that reader to
+ * cli_msxml_parse_document() together with the hwpml_keys table.
+ *
+ * Returns CL_ENULLARG when ctx is NULL, otherwise the parser's result.  If
+ * the reader cannot be created the function returns CL_SUCCESS (or, with
+ * HAVE_JSON, whatever cli_json_parse_error() reports).  Without libxml2 the
+ * document is not parsed and CL_SUCCESS is returned.
+ */
+int cli_scanhwpml(cli_ctx *ctx)
+{
+#if HAVE_LIBXML2
+    struct msxml_cbdata cbdata;
+    xmlTextReaderPtr reader = NULL;
+    int ret = CL_SUCCESS;
+
+    cli_dbgmsg("in cli_scanhwpml()\n");
+
+    if (!ctx)
+        return CL_ENULLARG;
+
+    memset(&cbdata, 0, sizeof(cbdata));
+    cbdata.map = *ctx->fmap;
+
+    reader = xmlReaderForIO(msxml_read_cb, NULL, &cbdata, "hwpml.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
+    if (!reader) {
+        cli_dbgmsg("cli_scanhwpml: cannot intialize xmlReader\n");
+
+#if HAVE_JSON
+        ret = cli_json_parse_error(ctx->wrkproperty, "HWPML_ERROR_XML_READER_IO");
+#endif
+        return ret; // libxml2 failed!
+    }
+
+    ret = cli_msxml_parse_document(ctx, reader, hwpml_keys, num_hwpml_keys, 1);
+
+    xmlTextReaderClose(reader);
+    xmlFreeTextReader(reader);
+    return ret;
+#else
+    UNUSEDPARAM(ctx);
+    cli_dbgmsg("in cli_scanhwpml()\n");
+    cli_dbgmsg("cli_scanhwpml: scanning hwpml documents requires libxml2!\n");
+
+    return CL_SUCCESS;
+#endif
+}
diff --git a/libclamav/hwp.h b/libclamav/hwp.h
index a9ddb9f6d..f7264c28a 100644
--- a/libclamav/hwp.h
+++ b/libclamav/hwp.h
@@ -51,4 +51,7 @@ int cli_scanhwp5_stream(cli_ctx *ctx, hwp5_header_t *hwp5, char *name, int fd);
 /* HWP 3.0 - UNIQUE FORMAT */
 int cli_scanhwp3(cli_ctx *ctx);
 
+/* HWPML - SINGLE XML DOCUMENT (similar to MSXML) */
+int cli_scanhwpml(cli_ctx *ctx);
+
 #endif /* __HWP_H__ */
diff --git a/libclamav/msxml.c b/libclamav/msxml.c
index e7365eab7..31ea48468 100644
--- a/libclamav/msxml.c
+++ b/libclamav/msxml.c
@@ -79,24 +79,6 @@ static const struct key_entry msxml_keys[] = {
 };
 static size_t num_msxml_keys = sizeof(msxml_keys) / sizeof(struct key_entry);
 
-enum msxml_state {
-    MSXML_STATE_NORMAL = 0,
-    MSXML_STATE_ENTITY_START_1,
-    MSXML_STATE_ENTITY_START_2,
-    MSXML_STATE_ENTITY_HEX,
-    MSXML_STATE_ENTITY_DEC,
-    MSXML_STATE_ENTITY_CLOSE,
-    MSXML_STATE_ENTITY_NONE
-};
-
-struct msxml_cbdata {
-    enum msxml_state state;
-    fmap_t *map;
-    const unsigned char *window;
-    off_t winpos, mappos;
-    size_t winsize;
-};
-
 static inline size_t msxml_read_cb_new_window(struct msxml_cbdata *cbdata)
 {
     const unsigned char *new_window = NULL;
diff --git a/libclamav/msxml.h b/libclamav/msxml.h
index 3b7843688..600ea10f6 100644
--- a/libclamav/msxml.h
+++ b/libclamav/msxml.h
@@ -30,6 +30,25 @@
 
 #include "others.h"
 
+enum msxml_state {
+    MSXML_STATE_NORMAL = 0,
+    MSXML_STATE_ENTITY_START_1,
+    MSXML_STATE_ENTITY_START_2,
+    MSXML_STATE_ENTITY_HEX,
+    MSXML_STATE_ENTITY_DEC,
+    MSXML_STATE_ENTITY_CLOSE,
+    MSXML_STATE_ENTITY_NONE
+};
+
+struct msxml_cbdata {
+    enum msxml_state state;
+    fmap_t *map;
+    const unsigned char *window;
+    off_t winpos, mappos;
+    size_t winsize;
+};
+
+int msxml_read_cb(void *ctx, char *buffer, int len);
 int cli_scanmsxml(cli_ctx *ctx);
 
 #endif /* __MSXML_H */
diff --git a/libclamav/scanners.c b/libclamav/scanners.c
index b8e55fcda..b053a8439 100644
--- a/libclamav/scanners.c
+++ b/libclamav/scanners.c
@@ -2275,6 +2275,9 @@ static int cli_scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_file_
                 case CL_TYPE_XML_XL:
                     ret = cli_scanmsxml(ctx);
                     break;
+                case CL_TYPE_XML_HWP:
+                    ret = cli_scanhwpml(ctx);
+                    break;
                 case CL_TYPE_RARSFX:
                     if(type != CL_TYPE_RAR && have_rar && SCAN_ARCHIVE && (DCONF_ARCH & ARCH_CONF_RAR)) {
                         char *tmpname = NULL;
@@ -2681,7 +2684,8 @@ static int magic_scandesc(cli_ctx *ctx, cli_file_t type)
             type == CL_TYPE_OOXML_XL ||
             type == CL_TYPE_XML_WORD ||
             type == CL_TYPE_XML_XL ||
-            type == CL_TYPE_HWP3) {
+            type == CL_TYPE_HWP3 ||
+            type == CL_TYPE_XML_HWP) {
             ctx->properties = json_object_new_object();
             if (NULL == ctx->properties) {
                 cli_errmsg("magic_scandesc: no memory for json properties object\n");
@@ -2851,6 +2855,10 @@ static int magic_scandesc(cli_ctx *ctx, cli_file_t type)
             ret = cli_scanmsxml(ctx);
             break;
 
+        case CL_TYPE_XML_HWP:
+            ret = cli_scanhwpml(ctx);
+            break;
+
         case CL_TYPE_XDP:
             ret = cli_scanxdp(ctx);
             break;