Files
nzbget/daemon/queue/NzbFile.cpp
Andrey Prygunkov 1ee8e02586 #745: fixed crash caused by malformed nzb files
: for file elements not having subject attribute a (somewhat) random
subject is generated and is used as file name; correct file names are
read from articles later anyway
2021-04-15 01:26:36 +02:00

854 lines
20 KiB
C++

/*
* This file is part of nzbget. See <http://nzbget.net>.
*
* Copyright (C) 2004 Sven Henkel <sidddy@users.sourceforge.net>
* Copyright (C) 2007-2016 Andrey Prygunkov <hugbug@users.sourceforge.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "nzbget.h"
#include "NzbFile.h"
#include "Log.h"
#include "DownloadInfo.h"
#include "Options.h"
#include "DiskState.h"
#include "Util.h"
#include "FileSystem.h"
/**
 * Prepares parsing of the nzb-file `fileName`.
 * Remembers the file name, allocates the NzbInfo object that receives the
 * parsed content, stores file name and category in it and asks it to build
 * its destination directory name.
 */
NzbFile::NzbFile(const char* fileName, const char* category) :
m_fileName(fileName)
{
debug("Creating NZBFile");
m_nzbInfo = std::make_unique<NzbInfo>();
m_nzbInfo->SetFilename(fileName);
m_nzbInfo->SetCategory(category);
// called after filename and category are set
// NOTE(review): presumably the directory name is derived from them - confirm
m_nzbInfo->BuildDestDirName();
}
/**
 * Writes the name of the nzb-file handled by this object to the log via info().
 */
void NzbFile::LogDebugInfo()
{
info(" NZBFile %s", *m_fileName);
}
/**
 * Stores an article in the article list of `fileInfo` at the slot given by
 * its 1-based part number (part 1 goes to index 0). The list is grown when
 * the slot does not exist yet; slots of parts not seen so far stay empty.
 * An article already stored under the same part number is replaced.
 *
 * Articles with a part number below 1 have no valid slot and are discarded,
 * as are null arguments.
 */
void NzbFile::AddArticle(FileInfo* fileInfo, std::unique_ptr<ArticleInfo> articleInfo)
{
	if (!fileInfo || !articleInfo)
	{
		return;
	}

	const int index = articleInfo->GetPartNumber() - 1;
	if (index < 0)
	{
		// a negative index would write in front of the list storage
		return;
	}

	auto articles = fileInfo->GetArticles();

	// make Article-List big enough
	if (index >= (int)articles->size())
	{
		articles->resize(index + 1);
	}

	(*articles)[index] = std::move(articleInfo);
}
void NzbFile::AddFileInfo(std::unique_ptr<FileInfo> fileInfo)
{
// calculate file size and delete empty articles
int64 size = 0;
int64 missedSize = 0;
int64 oneSize = 0;
int uncountedArticles = 0;
int missedArticles = 0;
int totalArticles = (int)fileInfo->GetArticles()->size();
int i = 0;
for (ArticleList::iterator it = fileInfo->GetArticles()->begin(); it != fileInfo->GetArticles()->end(); )
{
ArticleInfo* article = (*it).get();
if (!article)
{
fileInfo->GetArticles()->erase(it);
it = fileInfo->GetArticles()->begin() + i;
missedArticles++;
if (oneSize > 0)
{
missedSize += oneSize;
}
else
{
uncountedArticles++;
}
}
else
{
size += article->GetSize();
if (oneSize == 0)
{
oneSize = article->GetSize();
}
it++;
i++;
}
}
if (fileInfo->GetArticles()->empty())
{
return;
}
missedSize += uncountedArticles * oneSize;
size += missedSize;
fileInfo->SetNzbInfo(m_nzbInfo.get());
fileInfo->SetSize(size);
fileInfo->SetRemainingSize(size - missedSize);
fileInfo->SetMissedSize(missedSize);
fileInfo->SetTotalArticles(totalArticles);
fileInfo->SetMissedArticles(missedArticles);
m_nzbInfo->GetFileList()->Add(std::move(fileInfo));
}
/**
 * Derives a filename from the subject of a file element and stores it in
 * `fileInfo`.
 *
 * Steps:
 *  1. a missing subject is replaced with one generated from the element id;
 *  2. a trailing " yEnc (part/total)" is removed;
 *  3. if `TryQuotes` is set and the subject contains a quoted text with a dot
 *     in it, that text is used as filename;
 *  4. otherwise the subject is split into tokens (separated by spaces, quoted
 *     texts are kept together) and the last token containing a dot followed
 *     by at least one character is used; without such a token the last token
 *     is used; without any token the subject itself is used.
 *
 * All scans stay within the terminated subject text, also for empty subjects
 * and for subjects ending with a quotation mark or consisting of ")" only.
 */
void NzbFile::ParseSubject(FileInfo* fileInfo, bool TryQuotes)
{
	// Example subject: some garbage "title" yEnc (10/99)

	if (!fileInfo->GetSubject())
	{
		// Malformed file element without subject. We generate subject using internal element id.
		fileInfo->SetSubject(CString::FormatStr("%d", fileInfo->GetId()));
	}

	BString<1024> subject = fileInfo->GetSubject();
	char* const text = subject;
	const size_t subjectLen = strlen(text);

	auto isDigit = [](char ch) { return ch >= '0' && ch <= '9'; };

	// strip the "yEnc (10/99)"-suffix
	if (subjectLen > 1 && text[subjectLen - 1] == ')')
	{
		char* end = text + subjectLen - 2;
		while (end > text && isDigit(*end)) end--;
		if (end > text && *end == '/')
		{
			end--;
			while (end > text && isDigit(*end)) end--;
			// "end" is expected to point to the opening parenthesis now
			if (end - text > 6 && !strncmp(end - 6, " yEnc (", 7))
			{
				end[-6] = '\0';
			}
		}
	}

	if (TryQuotes)
	{
		// try to use the filename in quotation marks
		char* start = strchr(text, '\"');
		// a quotation mark at the very end of the subject has nothing behind it to scan
		if (start && start[1] != '\0')
		{
			start++;
			char* closing = strchr(start + 1, '\"');
			if (closing)
			{
				int len = (int)(closing - start);
				char* point = strchr(start + 1, '.');
				if (point && point < closing)
				{
					BString<1024> filename;
					filename.Set(start, len);
					fileInfo->SetFilename(filename);
					return;
				}
			}
		}
	}

	// tokenize subject, considering spaces as separators and quotation
	// marks as non separatable token delimiters.
	// then take the last token containing dot (".") as a filename

	typedef std::vector<CString> TokenList;
	TokenList tokens;

	// tokenizing
	char* p = text;
	char* start = p;
	bool quot = false;
	while (true)
	{
		char ch = *p;
		bool sep = (ch == '\"') || (!quot && ch == ' ') || (ch == '\0');
		if (sep)
		{
			// end of token
			int len = (int)(p - start);
			if (len > 0)
			{
				tokens.emplace_back(start, len);
			}
			if (ch == '\0')
			{
				// end of subject; must not look at anything behind the terminator
				break;
			}
			start = p;
			if (ch != '\"' || quot)
			{
				start++;
			}
			quot = *start == '\"';
			if (quot)
			{
				start++;
				char* q = strchr(start, '\"');
				if (q)
				{
					// continue at the closing quotation mark
					p = q - 1;
				}
				else
				{
					// unbalanced quotation mark, treat the rest as unquoted
					quot = false;
				}
			}
		}
		p++;
	}

	if (!tokens.empty())
	{
		// finding the best candidate for being a filename
		char* besttoken = tokens.back();
		for (TokenList::reverse_iterator it = tokens.rbegin(); it != tokens.rend(); it++)
		{
			char* s = *it;
			char* dot = strchr(s, '.');
			if (dot && (dot[1] != '\0'))
			{
				besttoken = s;
				break;
			}
		}
		fileInfo->SetFilename(besttoken);
	}
	else
	{
		// subject is empty or contains only separators?
		debug("Could not extract Filename from Subject: %s. Using Subject as Filename", fileInfo->GetSubject());
		fileInfo->SetFilename(fileInfo->GetSubject());
	}
}
bool NzbFile::HasDuplicateFilenames()
{
for (FileList::iterator it = m_nzbInfo->GetFileList()->begin(); it != m_nzbInfo->GetFileList()->end(); it++)
{
FileInfo* fileInfo1 = (*it).get();
int dupe = 1;
for (FileList::iterator it2 = it + 1; it2 != m_nzbInfo->GetFileList()->end(); it2++)
{
FileInfo* fileInfo2 = (*it2).get();
if (!strcmp(fileInfo1->GetFilename(), fileInfo2->GetFilename()) &&
strcmp(fileInfo1->GetSubject(), fileInfo2->GetSubject()))
{
dupe++;
}
}
// If more than two files have the same parsed filename but different subjects,
// this means, that the parsing was not correct.
// in this case we take subjects as filenames to prevent
// false "duplicate files"-alarm.
// It's Ok for just two files to have the same filename, this is
// an often case by posting-errors to repost bad files
if (dupe > 2 || (dupe == 2 && m_nzbInfo->GetFileList()->size() == 2))
{
return true;
}
}
return false;
}
/**
* Generate filenames from subjects and check if the parsing of subject was correct
*/
void NzbFile::BuildFilenames()
{
for (FileInfo* fileInfo : m_nzbInfo->GetFileList())
{
ParseSubject(fileInfo, true);
}
if (HasDuplicateFilenames())
{
for (FileInfo* fileInfo : m_nzbInfo->GetFileList())
{
ParseSubject(fileInfo, false);
}
}
if (HasDuplicateFilenames())
{
m_nzbInfo->SetManyDupeFiles(true);
for (FileInfo* fileInfo : m_nzbInfo->GetFileList())
{
fileInfo->SetFilename(fileInfo->GetSubject());
}
}
}
/**
 * Calculates two content hashes over the message ids of all articles and
 * stores them in the NzbInfo:
 *  - the full hash covers every article;
 *  - the filtered hash leaves out files whose extension matches the
 *    ParIgnoreExt option (par-files are never left out).
 * Files are processed in descending filename order, so the hashes do not
 * depend on the order of the file elements in the nzb-file as long as the
 * filenames are distinct.
 *
 * Articles without a message id are skipped.
 */
void NzbFile::CalcHashes()
{
	RawFileList sortedFiles;

	for (FileInfo* fileInfo : m_nzbInfo->GetFileList())
	{
		sortedFiles.push_back(fileInfo);
	}

	std::sort(sortedFiles.begin(), sortedFiles.end(),
		[](FileInfo* first, FileInfo* second)
		{
			return strcmp(first->GetFilename(), second->GetFilename()) > 0;
		});

	uint32 fullContentHash = 0;
	uint32 filteredContentHash = 0;
	int useForFilteredCount = 0;

	for (FileInfo* fileInfo : sortedFiles)
	{
		// check file extension
		bool skip = !fileInfo->GetParFile() &&
			Util::MatchFileExt(fileInfo->GetFilename(), g_Options->GetParIgnoreExt(), ",;");

		for (ArticleInfo* article: fileInfo->GetArticles())
		{
			const char* messageId = article->GetMessageId();
			if (!messageId)
			{
				// The id is assigned at the end tag of a segment; for malformed
				// nzb-files it may never have been set.
				// NOTE(review): presumably returns null when unset, like GetSubject() - confirm
				continue;
			}

			int len = (int)strlen(messageId);
			fullContentHash = Util::HashBJ96(messageId, len, fullContentHash);
			if (!skip)
			{
				filteredContentHash = Util::HashBJ96(messageId, len, filteredContentHash);
				useForFilteredCount++;
			}
		}
	}

	// if filtered hash is based on less than a half of files - do not use filtered hash at all
	// NOTE(review): useForFilteredCount counts articles, not files, while it is compared
	// with half of the file count; kept as is because changing it changes stored hashes
	if (useForFilteredCount < (int)sortedFiles.size() / 2)
	{
		filteredContentHash = 0;
	}

	m_nzbInfo->SetFullContentHash(fullContentHash);
	m_nzbInfo->SetFilteredContentHash(filteredContentHash);
}
/**
 * Post-processes the parsed file list:
 *  - builds and validates the filenames;
 *  - marks files whose lowercased name contains ".par2" as par-files;
 *  - accumulates file count, article counts and sizes in the NzbInfo;
 *  - updates min/max time and calculates the content hashes;
 *  - in server mode saves each file via the disk state and clears its
 *    article list afterwards;
 *  - rereads the password from the file if the xml-parser has found one.
 */
void NzbFile::ProcessFiles()
{
	BuildFilenames();

	for (FileInfo* fileInfo : m_nzbInfo->GetFileList())
	{
		fileInfo->MakeValidFilename();

		BString<1024> loFileName = fileInfo->GetFilename();
		for (char* p = loFileName; *p; p++)
		{
			// convert string to lowercase; tolower() requires a value representable
			// as unsigned char, a plain (possibly negative) char of a non-ASCII
			// filename is not
			*p = (char)tolower((unsigned char)*p);
		}
		bool parFile = strstr(loFileName, ".par2");

		m_nzbInfo->SetFileCount(m_nzbInfo->GetFileCount() + 1);
		m_nzbInfo->SetTotalArticles(m_nzbInfo->GetTotalArticles() + fileInfo->GetTotalArticles());
		m_nzbInfo->SetFailedArticles(m_nzbInfo->GetFailedArticles() + fileInfo->GetMissedArticles());
		m_nzbInfo->SetCurrentFailedArticles(m_nzbInfo->GetCurrentFailedArticles() + fileInfo->GetMissedArticles());
		m_nzbInfo->SetSize(m_nzbInfo->GetSize() + fileInfo->GetSize());
		m_nzbInfo->SetRemainingSize(m_nzbInfo->GetRemainingSize() + fileInfo->GetRemainingSize());
		m_nzbInfo->SetFailedSize(m_nzbInfo->GetFailedSize() + fileInfo->GetMissedSize());
		m_nzbInfo->SetCurrentFailedSize(m_nzbInfo->GetFailedSize());

		fileInfo->SetParFile(parFile);
		if (parFile)
		{
			m_nzbInfo->SetParSize(m_nzbInfo->GetParSize() + fileInfo->GetSize());
			m_nzbInfo->SetParFailedSize(m_nzbInfo->GetParFailedSize() + fileInfo->GetMissedSize());
			m_nzbInfo->SetParCurrentFailedSize(m_nzbInfo->GetParFailedSize());
			m_nzbInfo->SetRemainingParCount(m_nzbInfo->GetRemainingParCount() + 1);
		}
	}

	m_nzbInfo->UpdateMinMaxTime();

	CalcHashes();

	if (g_Options->GetServerMode())
	{
		for (FileInfo* fileInfo : m_nzbInfo->GetFileList())
		{
			// the articles are written to disk state first and dropped from memory afterwards
			g_DiskState->SaveFile(fileInfo);
			fileInfo->GetArticles()->clear();
		}
	}

	if (m_password)
	{
		ReadPassword();
	}
}
/**
* Password read using XML-parser may have special characters (such as TAB) stripped.
* This function rereads password directly from file to keep all characters intact.
*/
void NzbFile::ReadPassword()
{
DiskFile file;
if (!file.Open(m_fileName, DiskFile::omRead))
{
return;
}
// obtain file size.
file.Seek(0, DiskFile::soEnd);
int size = (int)file.Position();
file.Seek(0, DiskFile::soSet);
// reading first 4KB of the file
CharBuffer buf(4096);
size = size < 4096 ? size : 4096;
// copy the file into the buffer.
file.Read(buf, size);
file.Close();
buf[size-1] = '\0';
char* metaPassword = strstr(buf, "<meta type=\"password\">");
if (metaPassword)
{
metaPassword += 22; // length of '<meta type="password">'
char* end = strstr(metaPassword, "</meta>");
if (end)
{
*end = '\0';
WebUtil::XmlDecode(metaPassword);
m_password = metaPassword;
}
}
}
#ifdef WIN32
bool NzbFile::Parse()
{
CoInitialize(nullptr);
HRESULT hr;
MSXML::IXMLDOMDocumentPtr doc;
hr = doc.CreateInstance(MSXML::CLSID_DOMDocument);
if (FAILED(hr))
{
return false;
}
// Load the XML document file...
doc->put_resolveExternals(VARIANT_FALSE);
doc->put_validateOnParse(VARIANT_FALSE);
doc->put_async(VARIANT_FALSE);
_variant_t vFilename(*WString(*m_fileName));
// 1. first trying to load via filename without URL-encoding (certain charaters doesn't work when encoded)
VARIANT_BOOL success = doc->load(vFilename);
if (success == VARIANT_FALSE)
{
// 2. now trying filename encoded as URL
char url[2048];
EncodeUrl(m_fileName, url, 2048);
debug("url=\"%s\"", url);
_variant_t vUrl(url);
success = doc->load(vUrl);
}
if (success == VARIANT_FALSE)
{
_bstr_t r(doc->GetparseError()->reason);
const char* errMsg = r;
m_nzbInfo->AddMessage(Message::mkError, BString<1024>("Error parsing nzb-file %s: %s",
FileSystem::BaseFileName(m_fileName), errMsg));
return false;
}
if (!ParseNzb(doc))
{
return false;
}
if (m_nzbInfo->GetFileList()->empty())
{
m_nzbInfo->AddMessage(Message::mkError, BString<1024>(
"Error parsing nzb-file %s: file has no content", FileSystem::BaseFileName(m_fileName)));
return false;
}
ProcessFiles();
return true;
}
void NzbFile::EncodeUrl(const char* filename, char* url, int bufLen)
{
WString widefilename(filename);
char* end = url + bufLen;
for (wchar_t* p = widefilename; *p && url < end - 3; p++)
{
wchar_t ch = *p;
if (('0' <= ch && ch <= '9') ||
('a' <= ch && ch <= 'z') ||
('A' <= ch && ch <= 'Z') ||
ch == '-' || ch == '.' || ch == '_' || ch == '~')
{
*url++ = (char)ch;
}
else
{
*url++ = '%';
uint32 a = (uint32)ch >> 4;
*url++ = a > 9 ? a - 10 + 'A' : a + '0';
a = ch & 0xF;
*url++ = a > 9 ? a - 10 + 'A' : a + '0';
}
}
*url = '\0';
}
/**
 * Walks the loaded MSXML document and fills the NzbInfo (Windows build).
 *
 * Reads the optional password from /nzb/head/meta[@type='password'], then
 * creates a FileInfo with groups and articles for every /nzb/file element.
 * Segments with a part number below 1 are ignored.
 *
 * Returns false if a file element has no "subject" attribute or a segment
 * element lacks the "number" or "bytes" attribute; true otherwise.
 */
bool NzbFile::ParseNzb(IUnknown* nzb)
{
	MSXML::IXMLDOMDocumentPtr doc = nzb;
	MSXML::IXMLDOMNodePtr root = doc->documentElement;

	MSXML::IXMLDOMNodePtr passwordNode = root->selectSingleNode("/nzb/head/meta[@type='password']");
	if (passwordNode)
	{
		_bstr_t password(passwordNode->Gettext());
		m_password = password;
	}

	MSXML::IXMLDOMNodeListPtr fileNodes = root->selectNodes("/nzb/file");
	for (int fileIdx = 0; fileIdx < fileNodes->Getlength(); fileIdx++)
	{
		MSXML::IXMLDOMNodePtr fileNode = fileNodes->Getitem(fileIdx);

		MSXML::IXMLDOMNodePtr subjectAttr = fileNode->Getattributes()->getNamedItem("subject");
		if (!subjectAttr)
		{
			return false;
		}
		_bstr_t subject(subjectAttr->Gettext());

		std::unique_ptr<FileInfo> fileInfo = std::make_unique<FileInfo>();
		fileInfo->SetSubject(subject);

		MSXML::IXMLDOMNodePtr dateAttr = fileNode->Getattributes()->getNamedItem("date");
		if (dateAttr)
		{
			_bstr_t date(dateAttr->Gettext());
			fileInfo->SetTime(atoi(date));
		}

		// newsgroups
		MSXML::IXMLDOMNodeListPtr groupNodes = fileNode->selectNodes("groups/group");
		for (int groupIdx = 0; groupIdx < groupNodes->Getlength(); groupIdx++)
		{
			MSXML::IXMLDOMNodePtr groupNode = groupNodes->Getitem(groupIdx);
			_bstr_t group = groupNode->Gettext();
			fileInfo->GetGroups()->push_back((const char*)group);
		}

		// articles
		MSXML::IXMLDOMNodeListPtr segmentNodes = fileNode->selectNodes("segments/segment");
		for (int segmentIdx = 0; segmentIdx < segmentNodes->Getlength(); segmentIdx++)
		{
			MSXML::IXMLDOMNodePtr segmentNode = segmentNodes->Getitem(segmentIdx);

			// the text of the element is the message id without angle brackets
			_bstr_t rawId = segmentNode->Gettext();
			BString<1024> id("<%s>", (const char*)rawId);

			MSXML::IXMLDOMNodePtr numberAttr = segmentNode->Getattributes()->getNamedItem("number");
			if (!numberAttr)
			{
				return false;
			}
			_bstr_t number(numberAttr->Gettext());

			MSXML::IXMLDOMNodePtr bytesAttr = segmentNode->Getattributes()->getNamedItem("bytes");
			if (!bytesAttr)
			{
				return false;
			}
			_bstr_t bytes(bytesAttr->Gettext());

			const int partNumber = atoi(number);
			const int lsize = atoi(bytes);
			if (partNumber <= 0)
			{
				continue;
			}

			std::unique_ptr<ArticleInfo> article = std::make_unique<ArticleInfo>();
			article->SetPartNumber(partNumber);
			article->SetMessageId(id);
			article->SetSize(lsize);
			AddArticle(fileInfo.get(), std::move(article));
		}

		AddFileInfo(std::move(fileInfo));
	}

	return true;
}
#else
/**
 * Parses the nzb-file using the SAX interface of libxml2 (non-Windows build).
 *
 * The static SAX_* functions are installed as callbacks and receive this
 * object as user data. When parsing fails or the file contains no files, an
 * error message is added to the NzbInfo.
 *
 * Returns true if the file was parsed and post-processed successfully.
 * Always returns false if the program was compiled without libxml2 support.
 */
bool NzbFile::Parse()
{
#ifdef DISABLE_LIBXML2
	// this is the nzb parser, not the feed parser
	error("Could not parse nzb-file, program was compiled without libxml2 support");
	return false;
#else
	xmlSAXHandler SAX_handler = {0};
	SAX_handler.startElement = reinterpret_cast<startElementSAXFunc>(SAX_StartElement);
	SAX_handler.endElement = reinterpret_cast<endElementSAXFunc>(SAX_EndElement);
	SAX_handler.characters = reinterpret_cast<charactersSAXFunc>(SAX_characters);
	SAX_handler.error = reinterpret_cast<errorSAXFunc>(SAX_error);
	SAX_handler.getEntity = reinterpret_cast<getEntitySAXFunc>(SAX_getEntity);

	m_ignoreNextError = false;

	int ret = xmlSAXUserParseFile(&SAX_handler, this, m_fileName);

	if (ret != 0)
	{
		m_nzbInfo->AddMessage(Message::mkError, BString<1024>(
			"Error parsing nzb-file %s", FileSystem::BaseFileName(m_fileName)));
		return false;
	}

	if (m_nzbInfo->GetFileList()->empty())
	{
		m_nzbInfo->AddMessage(Message::mkError, BString<1024>(
			"Error parsing nzb-file %s: file has no content", FileSystem::BaseFileName(m_fileName)));
		return false;
	}

	ProcessFiles();

	return true;
#endif
}
void NzbFile::Parse_StartElement(const char *name, const char **atts)
{
BString<1024> tagAttrMessage("Malformed nzb-file, tag <%s> must have attributes", name);
m_tagContent.Clear();
if (!strcmp("file", name))
{
m_fileInfo = std::make_unique<FileInfo>();
m_fileInfo->SetFilename(m_fileName);
if (!atts)
{
m_nzbInfo->AddMessage(Message::mkWarning, tagAttrMessage);
return;
}
for (int i = 0; atts[i]; i += 2)
{
const char* attrname = atts[i];
const char* attrvalue = atts[i + 1];
if (!strcmp("subject", attrname))
{
m_fileInfo->SetSubject(attrvalue);
}
if (!strcmp("date", attrname))
{
m_fileInfo->SetTime(atoi(attrvalue));
}
}
}
else if (!strcmp("segment", name))
{
if (!m_fileInfo)
{
m_nzbInfo->AddMessage(Message::mkWarning, "Malformed nzb-file, tag <segment> without tag <file>");
return;
}
if (!atts)
{
m_nzbInfo->AddMessage(Message::mkWarning, tagAttrMessage);
return;
}
int64 lsize = -1;
int partNumber = -1;
for (int i = 0; atts[i]; i += 2)
{
const char* attrname = atts[i];
const char* attrvalue = atts[i + 1];
if (!strcmp("bytes", attrname))
{
lsize = atol(attrvalue);
}
if (!strcmp("number", attrname))
{
partNumber = atol(attrvalue);
}
}
if (partNumber > 0)
{
// new segment, add it!
std::unique_ptr<ArticleInfo> article = std::make_unique<ArticleInfo>();
article->SetPartNumber(partNumber);
article->SetSize(lsize);
m_article = article.get();
AddArticle(m_fileInfo.get(), std::move(article));
}
}
else if (!strcmp("meta", name))
{
if (!atts)
{
m_nzbInfo->AddMessage(Message::mkWarning, tagAttrMessage);
return;
}
m_hasPassword = atts[0] && atts[1] && !strcmp("type", atts[0]) && !strcmp("password", atts[1]);
}
}
/**
 * Handles the end tag of an element using the text collected in m_tagContent.
 *
 *  - </file>: hands the current FileInfo over to the file list;
 *  - </group>: adds the text as newsgroup to the current file;
 *  - </segment>: assigns the text, enclosed in angle brackets, as message id
 *    to the article created by the start tag;
 *  - </meta>: stores the text as password if the element was
 *    <meta type="password">.
 *
 * End tags without a matching open file or article are ignored.
 */
void NzbFile::Parse_EndElement(const char *name)
{
	if (!strcmp("file", name))
	{
		// Close the file element, add the new file to file-list.
		// With nested file elements (malformed nzb-file) the FileInfo was
		// already handed over by the inner end tag and there is nothing to add.
		if (m_fileInfo)
		{
			AddFileInfo(std::move(m_fileInfo));
		}
		m_article = nullptr;
	}
	else if (!strcmp("group", name))
	{
		if (!m_fileInfo)
		{
			// error: bad nzb-file
			return;
		}

		m_fileInfo->GetGroups()->push_back(*m_tagContent);
		m_tagContent.Clear();
	}
	else if (!strcmp("segment", name))
	{
		if (!m_fileInfo || !m_article)
		{
			// error: bad nzb-file
			return;
		}

		// Get the #text part
		BString<1024> id("<%s>", *m_tagContent);
		m_article->SetMessageId(id);
		m_article = nullptr;
	}
	else if (!strcmp("meta", name) && m_hasPassword)
	{
		m_password = m_tagContent;
	}
}
/**
 * Appends `len` characters of element text from `buf` to m_tagContent.
 * The collected text is evaluated by Parse_EndElement().
 */
void NzbFile::Parse_Content(const char *buf, int len)
{
m_tagContent.Append(buf, len);
}
/**
 * SAX callback for start tags, installed by Parse().
 * Forwards the call to Parse_StartElement() of the NzbFile passed as user data.
 */
void NzbFile::SAX_StartElement(NzbFile* file, const char *name, const char **atts)
{
file->Parse_StartElement(name, atts);
}
/**
 * SAX callback for end tags, installed by Parse().
 * Forwards the call to Parse_EndElement() of the NzbFile passed as user data.
 */
void NzbFile::SAX_EndElement(NzbFile* file, const char *name)
{
file->Parse_EndElement(name);
}
/**
 * SAX callback for character data, installed by Parse().
 * Strips leading and trailing blanks (space, tab, CR, LF) from the chunk of
 * `len` characters starting at `xmlstr` and passes the rest, if any, to
 * Parse_Content() of the NzbFile given as user data.
 */
void NzbFile::SAX_characters(NzbFile* file, const char * xmlstr, int len)
{
	auto isBlank = [](char ch)
	{
		return ch == ' ' || ch == 10 || ch == 13 || ch == 9;
	};

	// index of the first non-blank character
	int first = 0;
	while (first < len && isBlank(xmlstr[first]))
	{
		first++;
	}

	// index behind the last non-blank character
	int last = len;
	while (last > first && isBlank(xmlstr[last - 1]))
	{
		last--;
	}

	if (last > first)
	{
		// interpret tag content
		file->Parse_Content(xmlstr + first, last - first);
	}
}
/**
 * SAX callback resolving an entity reference, installed by Parse().
 * Returns the predefined entity named `name`. If there is none, a warning is
 * added to the NzbInfo, the next error reported by the parser is marked to be
 * ignored and null is returned.
 */
void* NzbFile::SAX_getEntity(NzbFile* file, const char * name)
{
#ifdef DISABLE_LIBXML2
	void* entity = nullptr;
#else
	xmlEntityPtr entity = xmlGetPredefinedEntity((xmlChar* )name);
#endif

	if (entity)
	{
		return entity;
	}

	file->m_nzbInfo->AddMessage(Message::mkWarning, "entity not found");
	// SAX_error() skips one error while this flag is set
	file->m_ignoreNextError = true;

	return entity;
}
/**
 * SAX callback for parser errors, installed by Parse().
 * Formats the printf-style message, removes trailing line breaks and spaces
 * and adds it as error message to the NzbInfo. One error is silently skipped
 * if m_ignoreNextError was set by SAX_getEntity().
 */
void NzbFile::SAX_error(NzbFile* file, const char *msg, ...)
{
	if (file->m_ignoreNextError)
	{
		file->m_ignoreNextError = false;
		return;
	}

	char errMsg[1024];

	va_list argp;
	va_start(argp, msg);
	vsnprintf(errMsg, sizeof(errMsg), msg, argp);
	va_end(argp);
	errMsg[sizeof(errMsg) - 1] = '\0';

	// remove trailing CRLF
	size_t length = strlen(errMsg);
	while (length > 0)
	{
		const char last = errMsg[length - 1];
		if (last != '\n' && last != '\r' && last != ' ')
		{
			break;
		}
		errMsg[--length] = '\0';
	}

	file->m_nzbInfo->AddMessage(Message::mkError, BString<1024>("Error parsing nzb-file: %s", errMsg));
}
#endif