#include "HTMLArticleParser.h"
#include "InternetResource.h"
#include "TIPsDatabase.h"
// Chunk-selection filters shared by every HTMLArticleParser instance.
// parse() hands each of these to selectChunks() together with the letter-code
// string built by generateCodeString() (e.g. "H1T4Il"), so the patterns below
// are regular expressions over chunk-type codes rather than over raw HTML.
// The Z, D and K codes are used to pick out the page title, the META
// description and the META keywords respectively.
// NOTE(review): the meaning of the numeric/boolean constructor arguments and of
// the "[\\0]" argument is defined by Filter's declaration, which is not visible
// here — confirm there before changing them.

// Article body: one or more headings followed by valid text chunks.
Filter HTMLArticleParser::fArticle(
    0, 0,
    "(H[0-9])+[^H]*V[0-9]a?(V[0-9]a?|T[0-9]a?|Ia?|Aa?)*",
    "[\\0]",
    0,
    "plain headings with valid texts",
    regex::NOCASE,
    false);

// Page title chunk(s).
Filter HTMLArticleParser::fTitle(
    0, 0,
    "Z[^A-Z ]*",
    "[\\0]",
    0,
    "page title(s)",
    regex::NOCASE,
    false);

// META description chunk(s).
Filter HTMLArticleParser::fDescription(
    0, 0,
    "D[^A-Z ]*",
    "[\\0]",
    0,
    "page description(s)",
    regex::NOCASE,
    false);

// META keywords chunk(s).
Filter HTMLArticleParser::fKeywords(
    0, 0,
    "K[^A-Z ]*",
    "[\\0]",
    0,
    "page tags(s)",
    regex::NOCASE,
    false);
// Constructs an article parser. All arguments are forwarded unchanged to the
// HTMLParser base class; this class adds no construction logic of its own.
HTMLArticleParser::HTMLArticleParser(const int _pagegroupid,
                                     TIPsDatabase *_db,
                                     const Domain *_domain,
                                     const int _type,
                                     short _minimumByteDensity)
    : HTMLParser(_pagegroupid, _db, _domain, _type, _minimumByteDensity)
{
}
// Identifies this parser implementation.
// Returns the DBF_PARSERTYPE_HTMLARTICLEPARSER constant.
const int HTMLArticleParser::parsertype() const
{
    return DBF_PARSERTYPE_HTMLARTICLEPARSER;
}
const size_t HTMLArticleParser::parse(const InternetResource *ir, vector *objects) const {
//this parsing process requires many diverse custom rules
//that take effect at different stages
//thus there are several parsing techniques employed all over the place
const char *body = ir->body();
size_t bodysize = ir->size();
if (!bodysize) bodysize = strlen(body);
#ifdef _DEBUG
if (m_domain != ir->internetURIRequest()->domain()) {
DEBUGERROR0("[HTMLArticleParser]: DomainMismatch()");
throw DomainMismatch();
}
#endif
//chunks of stuff
vector chunks; chunks.reserve( bodysize / 100);
vector selectedzones; selectedzones.reserve(bodysize / 50000 + 10);
vector entityids; entityids.reserve(20);
vector tags; tags.reserve(10);
char *codestring; //represents the document in letter codes e.g. H1T4Il = heading 1, Text > 40 words, Linked Image
const char *newbody; //reconstructed resultant body taken from the document
const char *title, //title of the document
*description, //META description
*keywords; //META Keywords
char newbodyMD5[33];//md5 of the body of the IR
//document level averages, counts and maximums
/*
const HTMLTag *maxHeadingTag;
unsigned int numHeadings;
unsigned int totalWordCount,
totalChunksSize,
totalAlphaNumerics,
totalBytesSize,
maxDepth;
PERCENTAGE avgTextHTMLDensity;
PERCENTAGE avgAlphaNumeric;
unsigned int avgWordDensity;
*/
//processing path:
//parse the HTML into text only chunks and also related in-line continuous markup/text areas
//and calculate text-chunk level statistics
//this is the only function that accesses the body,
//the rest works of the text chunks and areas
getTextChunks(body, bodysize, &chunks); //output: chunks, markupareas, chunkformats
//calculate document averages over all text chunks and HTML in-line areas
/*
calcAggregates(&chunks,
//(passed by reference by the function definition)
totalWordCount, totalChunksSize, totalAlphaNumerics, totalBytesSize,
maxDepth, &maxHeadingTag, numHeadings,
avgTextHTMLDensity, avgAlphaNumeric, avgWordDensity
);
*/
//Generate the encoded string representation of the chunk types e.g. H1T4Il = heading 1, Text > 40 words, Linked Image
//HTMLParser:: output: codestring (regex chunk selection system)
generateCodeString(&chunks, &codestring);
//title, description and keywords
//HTMLParser:: select zones using codestring regex system, select and malloc it
//do this here because the body will get overwritten later
selectChunks(codestring, &fTitle, &selectedzones);
title = selectedzoneText(&chunks, &selectedzones); //example title
selectedzones.clear();
selectChunks(codestring, &fDescription, &selectedzones);
description = selectedzoneText(&chunks, &selectedzones); //
selectedzones.clear();
selectChunks(codestring, &fKeywords, &selectedzones);
keywords = selectedzoneText(&chunks, &selectedzones); //
selectedzones.clear();
//find all relevant text (including summaries again probably)
//this is to construct a final document
//selectedzones will be consecutive e.g. 1-3, 4-7, 10-45 no overlap
//there are all concatenated togther to form a final document
//HTMLParser:: select zones using codestring regex system
selectChunks(codestring, &fArticle, &selectedzones);
//HTMLParser:: concatenates all selected zones together
//could reuse the body of the IR because the result will be smaller but want the origonal html also
//thus ask the function to malloc a new area
if (newbody = selectedzonesToHTML(&chunks, &selectedzones)) {
selectedzones.clear();
if (searchForEntities(newbody, &entityids)) { //Parser:: returns number of discovered entities
//now we know we have a document with company names in so do all the time consuming stuff:
//regex -> tag normalisation
if (keywords) split(keywords, &tags); //tags: vector: blah, blah, and another
//the following is handled by the database now (part of the whole new categorisation scheme)
//searchForTagPhrases(newbody, &tags); //tags: vector: example tag, another tag, taggy, tag, tag
md5(newbody, newbodyMD5);
if (m_md5s.find(newbodyMD5) == m_md5s.end())
objects->push_back( //add this to the list of objects to save to the DB
//Article strdup's all inputs
//object frees all its own mallocs in ~Article()
//entityids and tags vectors are copied by value
new Article(m_db, ir, entityids, title, newbody, body, tags, description)
);
else {
//the md5 was found in the list of current MD5s from the DB
//so ignore this document
DEBUGPRINT("[%s]: MD5 hit (%s)", DEBUG_LINE, ir->domain(), newbodyMD5);
ir->internetURIRequest()->domain()->addMD5Hit(ir->internetURIRequest());
}
}
}
//free up
if (newbody) free((void*)newbody);
if (title) free((void*)title);
if (description) free((void*)description);
if (keywords) free((void*)keywords);
if (codestring) free((void*)codestring);
//write a new HTML document with all debug in
//caller responsible for freeing newbody
//const char *debug = debugbody(&chunks, &selectedzones, body, bodysize);
//free((void*)debug);
return objects->size();
}