Files
nzbget/FeedFile.cpp

514 lines
11 KiB
C++

/*
* This file is part of nzbget
*
* Copyright (C) 2013 Andrey Prygunkov <hugbug@users.sourceforge.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* $Revision$
* $Date$
*
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#ifdef WIN32
#include "win32.h"
#endif
#include <string.h>
#include <list>
#ifdef WIN32
#include <comutil.h>
#import <msxml.tlb> named_guids
using namespace MSXML;
#else
#include <libxml/parser.h>
#include <libxml/xmlreader.h>
#include <libxml/xmlerror.h>
#include <libxml/entities.h>
#endif
#include "nzbget.h"
#include "FeedFile.h"
#include "Log.h"
#include "DownloadInfo.h"
#include "Options.h"
#include "Util.h"
extern Options* g_pOptions;
FeedFile::FeedFile(const char* szFileName)
{
debug("Creating FeedFile");
m_szFileName = strdup(szFileName);
m_pFeedItemInfos = new FeedItemInfos();
m_pFeedItemInfos->Retain();
#ifndef WIN32
m_pFeedItemInfo = NULL;
m_szTagContent = NULL;
m_iTagContentLen = 0;
#endif
}
FeedFile::~FeedFile()
{
debug("Destroying FeedFile");
// Cleanup
if (m_szFileName)
{
free(m_szFileName);
}
m_pFeedItemInfos->Release();
#ifndef WIN32
if (m_pFeedItemInfo)
{
delete m_pFeedItemInfo;
}
if (m_szTagContent)
{
free(m_szTagContent);
}
#endif
}
void FeedFile::LogDebugInfo()
{
debug(" FeedFile %s", m_szFileName);
}
void FeedFile::AddItem(FeedItemInfo* pFeedItemInfo)
{
m_pFeedItemInfos->push_back(pFeedItemInfo);
}
void FeedFile::ParseSubject(FeedItemInfo* pFeedItemInfo)
{
// if title has quatation marks we use only part within quatation marks
char* p = (char*)pFeedItemInfo->GetTitle();
char* start = strchr(p, '\"');
if (start)
{
start++;
char* end = strchr(start + 1, '\"');
if (end)
{
int len = (int)(end - start);
char* point = strchr(start + 1, '.');
if (point && point < end)
{
char* filename = (char*)malloc(len + 1);
strncpy(filename, start, len);
filename[len] = '\0';
char* ext = strrchr(filename, '.');
if (ext && !strcasecmp(ext, ".par2"))
{
*ext = '\0';
}
pFeedItemInfo->SetFilename(filename);
free(filename);
return;
}
}
}
pFeedItemInfo->SetFilename(pFeedItemInfo->GetTitle());
}
#ifdef WIN32
FeedFile* FeedFile::Create(const char* szFileName)
{
CoInitialize(NULL);
HRESULT hr;
MSXML::IXMLDOMDocumentPtr doc;
hr = doc.CreateInstance(MSXML::CLSID_DOMDocument);
if (FAILED(hr))
{
return NULL;
}
// Load the XML document file...
doc->put_resolveExternals(VARIANT_FALSE);
doc->put_validateOnParse(VARIANT_FALSE);
doc->put_async(VARIANT_FALSE);
// filename needs to be properly encoded
char* szURL = (char*)malloc(strlen(szFileName)*3 + 1);
EncodeURL(szFileName, szURL);
debug("url=\"%s\"", szURL);
_variant_t v(szURL);
free(szURL);
VARIANT_BOOL success = doc->load(v);
if (success == VARIANT_FALSE)
{
_bstr_t r(doc->GetparseError()->reason);
const char* szErrMsg = r;
error("Error parsing rss feed: %s", szErrMsg);
return NULL;
}
FeedFile* pFile = new FeedFile(szFileName);
if (!pFile->ParseFeed(doc))
{
delete pFile;
pFile = NULL;
}
return pFile;
}
void FeedFile::EncodeURL(const char* szFilename, char* szURL)
{
while (char ch = *szFilename++)
{
if (('0' <= ch && ch <= '9') ||
('a' <= ch && ch <= 'z') ||
('A' <= ch && ch <= 'Z') )
{
*szURL++ = ch;
}
else
{
*szURL++ = '%';
int a = ch >> 4;
*szURL++ = a > 9 ? a - 10 + 'a' : a + '0';
a = ch & 0xF;
*szURL++ = a > 9 ? a - 10 + 'a' : a + '0';
}
}
*szURL = NULL;
}
bool FeedFile::ParseFeed(IUnknown* nzb)
{
MSXML::IXMLDOMDocumentPtr doc = nzb;
MSXML::IXMLDOMNodePtr root = doc->documentElement;
MSXML::IXMLDOMNodeListPtr itemList = root->selectNodes("/rss/channel/item");
for (int i = 0; i < itemList->Getlength(); i++)
{
MSXML::IXMLDOMNodePtr node = itemList->Getitem(i);
FeedItemInfo* pFeedItemInfo = new FeedItemInfo();
AddItem(pFeedItemInfo);
MSXML::IXMLDOMNodePtr tag;
MSXML::IXMLDOMNodePtr attr;
// <title>Debian 6</title>
tag = node->selectSingleNode("title");
if (!tag)
{
// bad rss feed
return false;
}
_bstr_t title(tag->Gettext());
pFeedItemInfo->SetTitle(title);
ParseSubject(pFeedItemInfo);
// <pubDate>Wed, 26 Jun 2013 00:02:54 -0600</pubDate>
tag = node->selectSingleNode("pubDate");
if (tag)
{
_bstr_t time(tag->Gettext());
time_t unixtime = Util::ParseRfc822DateTime(time);
if (unixtime > 0)
{
pFeedItemInfo->SetTime(unixtime);
}
}
// <category>Movies &gt; HD</category>
tag = node->selectSingleNode("category");
if (tag)
{
_bstr_t category(tag->Gettext());
pFeedItemInfo->SetCategory(category);
}
//<enclosure url="http://myindexer.com/fetch/9eeb264aecce961a6e0d" length="150263340" type="application/x-nzb" />
tag = node->selectSingleNode("enclosure");
if (tag)
{
attr = tag->Getattributes()->getNamedItem("url");
if (attr)
{
_bstr_t url(attr->Gettext());
pFeedItemInfo->SetUrl(url);
}
attr = tag->Getattributes()->getNamedItem("length");
if (attr)
{
_bstr_t size(attr->Gettext());
long long lSize = atoll(size);
pFeedItemInfo->SetSize(lSize);
}
}
if (!pFeedItemInfo->GetUrl())
{
// <link>https://nzb.org/fetch/334534ce/4364564564</link>
tag = node->selectSingleNode("link");
if (!tag)
{
// bad rss feed
return false;
}
_bstr_t link(tag->Gettext());
pFeedItemInfo->SetUrl(link);
}
// newznab special
//<newznab:attr name="size" value="5423523453534" />
if (pFeedItemInfo->GetSize() == 0)
{
tag = node->selectSingleNode("newznab:attr[@name='size']");
if (tag)
{
attr = tag->Getattributes()->getNamedItem("value");
if (attr)
{
_bstr_t size(attr->Gettext());
long long lSize = atoll(size);
pFeedItemInfo->SetSize(lSize);
}
}
}
}
return true;
}
#else
FeedFile* FeedFile::Create(const char* szFileName)
{
FeedFile* pFile = new FeedFile(szFileName);
xmlSAXHandler SAX_handler = {0};
SAX_handler.startElement = reinterpret_cast<startElementSAXFunc>(SAX_StartElement);
SAX_handler.endElement = reinterpret_cast<endElementSAXFunc>(SAX_EndElement);
SAX_handler.characters = reinterpret_cast<charactersSAXFunc>(SAX_characters);
SAX_handler.error = reinterpret_cast<errorSAXFunc>(SAX_error);
SAX_handler.getEntity = reinterpret_cast<getEntitySAXFunc>(SAX_getEntity);
pFile->m_bIgnoreNextError = false;
int ret = xmlSAXUserParseFile(&SAX_handler, pFile, szFileName);
if (ret != 0)
{
error("Failed to parse rss feed");
delete pFile;
pFile = NULL;
}
return pFile;
}
void FeedFile::Parse_StartElement(const char *name, const char **atts)
{
if (m_szTagContent)
{
free(m_szTagContent);
m_szTagContent = NULL;
m_iTagContentLen = 0;
}
if (!strcmp("item", name))
{
m_pFeedItemInfo = new FeedItemInfo();
}
else if (!strcmp("enclosure", name) && m_pFeedItemInfo)
{
//<enclosure url="http://myindexer.com/fetch/9eeb264aecce961a6e0d" length="150263340" type="application/x-nzb" />
for (; *atts; atts+=2)
{
if (!strcmp("url", atts[0]))
{
char* szUrl = strdup(atts[1]);
WebUtil::XmlDecode(szUrl);
m_pFeedItemInfo->SetUrl(szUrl);
free(szUrl);
}
else if (!strcmp("length", atts[0]))
{
long long lSize = atoll(atts[1]);
m_pFeedItemInfo->SetSize(lSize);
}
}
}
else if (!strcmp("newznab:attr", name) && m_pFeedItemInfo && m_pFeedItemInfo->GetSize() == 0)
{
//<newznab:attr name="size" value="5423523453534" />
if (atts[0] && atts[1] && atts[2] && atts[3] &&
!strcmp("name", atts[0]) && !strcmp("size", atts[1]) && !strcmp("value", atts[2]))
{
long long lSize = atoll(atts[3]);
m_pFeedItemInfo->SetSize(lSize);
}
}
}
void FeedFile::Parse_EndElement(const char *name)
{
if (!strcmp("item", name))
{
// Close the file element, add the new file to file-list
AddItem(m_pFeedItemInfo);
m_pFeedItemInfo = NULL;
}
else if (!strcmp("title", name) && m_pFeedItemInfo)
{
m_pFeedItemInfo->SetTitle(m_szTagContent);
ParseSubject(m_pFeedItemInfo);
m_szTagContent = NULL;
m_iTagContentLen = 0;
}
else if (!strcmp("link", name) && m_pFeedItemInfo &&
(!m_pFeedItemInfo->GetUrl() || strlen(m_pFeedItemInfo->GetUrl()) == 0))
{
m_pFeedItemInfo->SetUrl(m_szTagContent);
m_szTagContent = NULL;
m_iTagContentLen = 0;
}
else if (!strcmp("category", name) && m_pFeedItemInfo)
{
m_pFeedItemInfo->SetCategory(m_szTagContent);
m_szTagContent = NULL;
m_iTagContentLen = 0;
}
else if (!strcmp("pubDate", name) && m_pFeedItemInfo)
{
time_t unixtime = Util::ParseRfc822DateTime(m_szTagContent);
if (unixtime > 0)
{
m_pFeedItemInfo->SetTime(unixtime);
}
m_szTagContent = NULL;
m_iTagContentLen = 0;
}
}
void FeedFile::Parse_Content(const char *buf, int len)
{
m_szTagContent = (char*)realloc(m_szTagContent, m_iTagContentLen + len + 1);
strncpy(m_szTagContent + m_iTagContentLen, buf, len);
m_iTagContentLen += len;
m_szTagContent[m_iTagContentLen] = '\0';
}
void FeedFile::SAX_StartElement(FeedFile* pFile, const char *name, const char **atts)
{
pFile->Parse_StartElement(name, atts);
}
void FeedFile::SAX_EndElement(FeedFile* pFile, const char *name)
{
pFile->Parse_EndElement(name);
}
void FeedFile::SAX_characters(FeedFile* pFile, const char * xmlstr, int len)
{
char* str = (char*)xmlstr;
// trim starting blanks
int off = 0;
for (int i = 0; i < len; i++)
{
char ch = str[i];
if (ch == ' ' || ch == 10 || ch == 13 || ch == 9)
{
off++;
}
else
{
break;
}
}
int newlen = len - off;
// trim ending blanks
for (int i = len - 1; i >= off; i--)
{
char ch = str[i];
if (ch == ' ' || ch == 10 || ch == 13 || ch == 9)
{
newlen--;
}
else
{
break;
}
}
if (newlen > 0)
{
// interpret tag content
pFile->Parse_Content(str + off, newlen);
}
}
void* FeedFile::SAX_getEntity(FeedFile* pFile, const char * name)
{
xmlEntityPtr e = xmlGetPredefinedEntity((xmlChar* )name);
if (!e)
{
warn("entity not found");
pFile->m_bIgnoreNextError = true;
}
return e;
}
void FeedFile::SAX_error(FeedFile* pFile, const char *msg, ...)
{
if (pFile->m_bIgnoreNextError)
{
pFile->m_bIgnoreNextError = false;
return;
}
va_list argp;
va_start(argp, msg);
char szErrMsg[1024];
vsnprintf(szErrMsg, sizeof(szErrMsg), msg, argp);
szErrMsg[1024-1] = '\0';
va_end(argp);
// remove trailing CRLF
for (char* pend = szErrMsg + strlen(szErrMsg) - 1; pend >= szErrMsg && (*pend == '\n' || *pend == '\r' || *pend == ' '); pend--) *pend = '\0';
error("Error parsing rss feed: %s", szErrMsg);
}
#endif