HWPML: added hwpml_keys for hwpml parsing

This commit is contained in:
Kevin Lin
2015-12-15 13:01:40 -05:00
parent 904fe15510
commit d2eea44a6d
5 changed files with 125 additions and 23 deletions

View File

@@ -23,8 +23,13 @@
#include "clamav-config.h"
#endif
#if HAVE_ICONV
#include <iconv.h>
#if HAVE_LIBXML2
#ifdef _WIN32
#ifndef LIBXML_WRITER_ENABLED
#define LIBXML_WRITER_ENABLED 1
#endif
#endif
#include <libxml/xmlreader.h>
#endif
#include <sys/types.h>
@@ -39,6 +44,8 @@
#include "str.h"
#include "others.h"
#include "scanners.h"
#include "msxml_parser.h"
#include "msxml.h"
#include "json_api.h"
#include "hwp.h"
#if HAVE_JSON
@@ -47,6 +54,7 @@
#define HWP5_DEBUG 0
#define HWP3_DEBUG 1
#define HWPML_DEBUG 1
#if HWP5_DEBUG
#define hwp5_debug(...) cli_dbgmsg(__VA_ARGS__)
#else
@@ -57,6 +65,11 @@
#else
#define hwp3_debug(...) ;
#endif
/*
 * hwpml_debug(): debug output for the HWPML code path.
 * Forwards its arguments to cli_dbgmsg() when HWPML_DEBUG is non-zero;
 * otherwise it expands to a single no-op statement.
 */
#if HWPML_DEBUG
#define hwpml_debug(...) cli_dbgmsg(__VA_ARGS__)
#else
/* do { } while (0) instead of a bare ';' so that "hwpml_debug(...);" is
 * exactly one statement and stays safe inside an un-braced if/else. */
#define hwpml_debug(...) do { } while (0)
#endif
typedef int (*hwp_cb )(void *cbdata, int fd, cli_ctx *ctx);
static int decompress_and_callback(cli_ctx *ctx, fmap_t *input, off_t at, size_t len, const char *parent, hwp_cb cb, void *cbdata)
@@ -488,8 +501,8 @@ static inline int parsehwp3_docsummary(cli_ctx *ctx, off_t offset)
return ret;
}
#else
UNUSED(ctx);
UNUSED(offset);
UNUSEDPARAM(ctx);
UNUSEDPARAM(offset);
#endif
return CL_SUCCESS;
}
@@ -637,3 +650,80 @@ int cli_scanhwp3(cli_ctx *ctx)
return ret;
}
/*** HWPML (hijacking the msxml parser) ***/
/*
 * Key table passed to cli_msxml_parse_document() when scanning HWPML.
 * NOTE(review): each entry looks like (lower-case element name, JSON key
 * name, MSXML_* flag mask) - confirm against the declaration of
 * struct key_entry.
 */
static const struct key_entry hwpml_keys[] = {
    { "hwpml",              "HWPML",                MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },

    /* HEAD - Document Properties */
    { "head",               "Head",                 MSXML_JSON_ROOT },
    { "docsummary",         "DocumentProperties",   MSXML_JSON_WRKPTR },
    { "title",              "Title",                MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "author",             "Author",               MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "date",               "Date",                 MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "docsetting",         "DocumentSettings",     MSXML_JSON_WRKPTR },
    { "beginnumber",        "BeginNumber",          MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
    { "caretpos",           "CaretPos",             MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
    { "bindatalist",        "BinDataList",          MSXML_JSON_WRKPTR },
    { "binitem",            "BinItem",              MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
    { "facenamelist",       "FaceNameList",         MSXML_IGNORE_ELEM }, /* fonts list */
    { "borderfilllist",     "BorderFillList",       MSXML_IGNORE_ELEM }, /* borders list */
    { "charshapelist",      "CharShapeList",        MSXML_IGNORE_ELEM }, /* character shapes */
    { "tabdeflist",         "TableDefList",         MSXML_IGNORE_ELEM }, /* table defs */
    { "numberinglist",      "NumberingList",        MSXML_IGNORE_ELEM }, /* numbering list */
    { "parashapelist",      "ParagraphShapeList",   MSXML_IGNORE_ELEM }, /* paragraph shapes */
    { "stylelist",          "StyleList",            MSXML_IGNORE_ELEM }, /* styles */
    { "compatibledocument", "WordCompatibility",    MSXML_IGNORE_ELEM }, /* word compatibility data */

    /* BODY - Document Contents */
    { "body",               "Body",                 MSXML_IGNORE_ELEM }, /* document contents (we could build a document contents summary) */

    /* TAIL - Document Attachments */
    { "tail",               "Tail",                 MSXML_JSON_ROOT },
    { "bindatastorage",     "BinaryDataStorage",    MSXML_JSON_WRKPTR },
    { "bindata",            "BinaryData",           MSXML_SCAN_B64 | MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
    { "scriptcode",         "ScriptCodeStorage",    MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
    { "scriptheader",       "ScriptHeader",         MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "scriptsource",       "ScriptSource",         MSXML_JSON_WRKPTR | MSXML_JSON_VALUE }
};

/* Number of entries in hwpml_keys; derived from the array itself so it stays
 * correct if the element type is ever changed. */
static const size_t num_hwpml_keys = sizeof(hwpml_keys) / sizeof(hwpml_keys[0]);
/**
 * Scan an HWPML document (a single XML file) by running the shared msxml
 * parser over it with the hwpml_keys table.
 *
 * With libxml2 available, the context's fmap is exposed to an xmlTextReader
 * through msxml_read_cb() and the reader is handed to
 * cli_msxml_parse_document(). Without libxml2 the function only logs that
 * HWPML scanning is unavailable.
 *
 * @param ctx  scan context; must not be NULL in libxml2 builds.
 * @return CL_ENULLARG if ctx is NULL (libxml2 builds only);
 *         if the reader cannot be created: CL_SUCCESS, or with HAVE_JSON the
 *         value returned by cli_json_parse_error();
 *         otherwise the value returned by cli_msxml_parse_document().
 *         Always CL_SUCCESS when built without libxml2.
 */
int cli_scanhwpml(cli_ctx *ctx)
{
#if HAVE_LIBXML2
    struct msxml_cbdata cbdata;
    xmlTextReaderPtr reader = NULL;
    int ret = CL_SUCCESS;

    cli_dbgmsg("in cli_scanhwpml()\n");

    if (!ctx)
        return CL_ENULLARG;

    /* Zero the read-callback state, then point it at the file map to read. */
    memset(&cbdata, 0, sizeof(cbdata));
    cbdata.map = *ctx->fmap;

    reader = xmlReaderForIO(msxml_read_cb, NULL, &cbdata, "hwpml.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
    if (!reader) {
        cli_dbgmsg("cli_scanhwpml: cannot initialize xmlReader\n");

#if HAVE_JSON
        /* Record the failure in the JSON properties; its result becomes ret. */
        ret = cli_json_parse_error(ctx->wrkproperty, "HWPML_ERROR_XML_READER_IO");
#endif
        return ret; /* libxml2 failed! */
    }

    /* NOTE(review): the meaning of the trailing argument (1) is defined by
     * cli_msxml_parse_document(); it is passed through unchanged here. */
    ret = cli_msxml_parse_document(ctx, reader, hwpml_keys, num_hwpml_keys, 1);

    /* The reader is released regardless of the parse result. */
    xmlTextReaderClose(reader);
    xmlFreeTextReader(reader);
    return ret;
#else
    UNUSEDPARAM(ctx);
    cli_dbgmsg("in cli_scanhwpml()\n");
    cli_dbgmsg("cli_scanhwpml: scanning hwpml documents requires libxml2!\n");

    return CL_SUCCESS;
#endif
}

View File

@@ -51,4 +51,7 @@ int cli_scanhwp5_stream(cli_ctx *ctx, hwp5_header_t *hwp5, char *name, int fd);
/* HWP 3.0 - UNIQUE FORMAT */
int cli_scanhwp3(cli_ctx *ctx);
/* HWPML - SINGLE XML DOCUMENT (similar to MSXML) */
int cli_scanhwpml(cli_ctx *ctx);
#endif /* __HWP_H__ */

View File

@@ -79,24 +79,6 @@ static const struct key_entry msxml_keys[] = {
};
static size_t num_msxml_keys = sizeof(msxml_keys) / sizeof(struct key_entry);
enum msxml_state {
MSXML_STATE_NORMAL = 0,
MSXML_STATE_ENTITY_START_1,
MSXML_STATE_ENTITY_START_2,
MSXML_STATE_ENTITY_HEX,
MSXML_STATE_ENTITY_DEC,
MSXML_STATE_ENTITY_CLOSE,
MSXML_STATE_ENTITY_NONE
};
struct msxml_cbdata {
enum msxml_state state;
fmap_t *map;
const unsigned char *window;
off_t winpos, mappos;
size_t winsize;
};
static inline size_t msxml_read_cb_new_window(struct msxml_cbdata *cbdata)
{
const unsigned char *new_window = NULL;

View File

@@ -30,6 +30,25 @@
#include "others.h"
/*
 * State values stored in struct msxml_cbdata.state.
 * NOTE(review): the ENTITY_* names suggest the read callback tracks its
 * progress through an XML character-entity reference (start, hex/decimal
 * digits, close) - confirm against msxml_read_cb() in msxml.c.
 */
enum msxml_state {
MSXML_STATE_NORMAL = 0, /* initial value (a zeroed cbdata starts here) */
MSXML_STATE_ENTITY_START_1,
MSXML_STATE_ENTITY_START_2,
MSXML_STATE_ENTITY_HEX,
MSXML_STATE_ENTITY_DEC,
MSXML_STATE_ENTITY_CLOSE,
MSXML_STATE_ENTITY_NONE
};
/*
 * Context object handed to msxml_read_cb() as its opaque ctx pointer when a
 * libxml2 reader is created over a file map. Callers zero it and set map
 * before creating the reader.
 */
struct msxml_cbdata {
enum msxml_state state; /* current enum msxml_state value */
fmap_t *map; /* file map supplying the document bytes */
/* NOTE(review): the fields below presumably describe the currently exposed
 * window of the map (pointer, positions, size) - confirm against
 * msxml_read_cb_new_window() in msxml.c. */
const unsigned char *window;
off_t winpos, mappos;
size_t winsize;
};
/* Read callback with the (context, buffer, length) shape that is passed to
 * libxml2's xmlReaderForIO(); ctx is expected to be a struct msxml_cbdata *. */
int msxml_read_cb(void *ctx, char *buffer, int len);
/* Scanner entry point for MS XML documents; takes the scan context and
 * returns a status code to the scanner dispatch. */
int cli_scanmsxml(cli_ctx *ctx);
#endif /* __MSXML_H */

View File

@@ -2275,6 +2275,9 @@ static int cli_scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_file_
case CL_TYPE_XML_XL:
ret = cli_scanmsxml(ctx);
break;
case CL_TYPE_XML_HWP:
ret = cli_scanhwpml(ctx);
break;
case CL_TYPE_RARSFX:
if(type != CL_TYPE_RAR && have_rar && SCAN_ARCHIVE && (DCONF_ARCH & ARCH_CONF_RAR)) {
char *tmpname = NULL;
@@ -2681,7 +2684,8 @@ static int magic_scandesc(cli_ctx *ctx, cli_file_t type)
type == CL_TYPE_OOXML_XL ||
type == CL_TYPE_XML_WORD ||
type == CL_TYPE_XML_XL ||
type == CL_TYPE_HWP3) {
type == CL_TYPE_HWP3 ||
type == CL_TYPE_XML_HWP) {
ctx->properties = json_object_new_object();
if (NULL == ctx->properties) {
cli_errmsg("magic_scandesc: no memory for json properties object\n");
@@ -2851,6 +2855,10 @@ static int magic_scandesc(cli_ctx *ctx, cli_file_t type)
ret = cli_scanmsxml(ctx);
break;
case CL_TYPE_XML_HWP:
ret = cli_scanhwpml(ctx);
break;
case CL_TYPE_XDP:
ret = cli_scanxdp(ctx);
break;