#include "HTMLGeneralParser.h"
#include "InternetResource.h"
#include "HTMLPage.h"
#include "TIPsDatabase.h"
// Filter for summary zones; its own description string reads
// "linked headings with valid texts". The pattern is matched against the
// letter-coded document string, starting at one or more "H<digit>a" codes.
// NOTE(review): the meaning of the numeric and boolean constructor arguments
// is defined by Filter, which is not visible from this file - confirm there.
Filter HTMLGeneralParser::fSummary(
    0,
    0,
    "(H[0-9]a)+[^H]*V[0-9]a?(V[0-9]a?|T[0-9]a?|Ia?|Aa?)*",
    "[\\0]",
    0,
    "linked headings with valid texts",
    regex::NOCASE,
    false);
// Filter for article zones; its own description string reads
// "plain headings with valid texts". The pattern is matched against the
// letter-coded document string, starting at one or more "H<digit>" codes.
// NOTE(review): the meaning of the numeric and boolean constructor arguments
// is defined by Filter, which is not visible from this file - confirm there.
Filter HTMLGeneralParser::fArticle(
    0,
    0,
    "(H[0-9])+[^H]*V[0-9]a?(V[0-9]a?|T[0-9]a?|Ia?|Aa?)*",
    "[\\0]",
    0,
    "plain headings with valid texts",
    regex::NOCASE,
    false);
// Filter for the page title zone(s); its own description string reads
// "page title(s)". The pattern starts at a "Z" code in the letter-coded
// document string.
// NOTE(review): the meaning of the numeric and boolean constructor arguments
// is defined by Filter, which is not visible from this file - confirm there.
Filter HTMLGeneralParser::fTitle(
    0,
    0,
    "Z[^A-Z ]*",
    "[\\0]",
    0,
    "page title(s)",
    regex::NOCASE,
    false);
// Constructs a general HTML parser. Every argument is handed unchanged to the
// HTMLParser base-class constructor; this class sets up no state of its own.
HTMLGeneralParser::HTMLGeneralParser(const int _pagegroupid, TIPsDatabase *_db, const Domain *_domain, const int _type, short _minimumByteDensity)
    : HTMLParser(_pagegroupid, _db, _domain, _type, _minimumByteDensity)
{
}
// Reports which parser implementation this is: always the
// DBF_PARSERTYPE_HTMLGENERALPARSER constant.
const int HTMLGeneralParser::parsertype() const
{
    return DBF_PARSERTYPE_HTMLGENERALPARSER;
}
const size_t HTMLGeneralParser::parse(const InternetResource *ir, vector *objects) const {
//this parsing process requires many diverse custom rules
//that take effect at different stages
//thus there are several parsing techniques employed all over the place
const char *body = ir->body();
size_t bodysize = ir->size();
if (!bodysize) bodysize = strlen(body);
#ifdef _DEBUG
if (m_domain != ir->internetURIRequest()->domain()) {
DEBUGERROR0("[HTMLArticleParser]: DomainMismatch()");
throw DomainMismatch();
}
#endif
//chunks of stuff
vector chunks; chunks.reserve( bodysize / 100);
vector selectedzones; selectedzones.reserve(bodysize / 50000 + 10);
vector entityids; entityids.reserve(20);
char *codestring; //represents the document in letter codes e.g. H1T4Il = heading 1, Text > 40 words, Linked Image
//const char *newbody; //reconstructed resultant body taken from the document
const char *title; //title of the document
//document level averages, counts and maximums
/*
const HTMLTag *maxHeadingTag;
unsigned int numHeadings;
unsigned int totalWordCount,
totalChunksSize,
totalAlphaNumerics,
totalBytesSize,
maxDepth;
PERCENTAGE avgTextHTMLDensity;
PERCENTAGE avgAlphaNumeric;
unsigned int avgWordDensity;
*/
//processing path:
//parse the HTML into text only chunks and also related in-line continuous markup/text areas
//and calculate text-chunk level statistics
//this is the only function that accesses the body,
//the rest works of the text chunks and areas
getTextChunks(body, bodysize, &chunks); //output: chunks, markupareas, chunkformats
//calculate document averages over all text chunks and HTML in-line areas
/*
calcAggregates(&chunks,
//(passed by reference by the function definition)
totalWordCount, totalChunksSize, totalAlphaNumerics, totalBytesSize,
maxDepth, &maxHeadingTag, numHeadings,
avgTextHTMLDensity, avgAlphaNumeric, avgWordDensity
);
*/
//Generate the encoded string representation of the chunk types e.g. H1T4Il = heading 1, Text > 40 words, Linked Image
generateCodeString(&chunks, &codestring); //HTMLParser:: output: codestring (regex chunk selection system)
//find summaries only for future linking in with articles
//NOT IMPLEMENTED YET
/*
selectChunks(codestring, &fSummary, &selectedzones);
for (vector::const_iterator i = selectedzones.begin(); i != selectedzones.end(); i++) {
const selectedzone& sz = *i;
newbody = selectedzoneToHTML(&chunks, sz);
if (newbody) objects->push_back(new Summary(m_db, ir, "title", newbody)); //object frees body
}
selectedzones.clear();
*/
//title
selectChunks(codestring, &fTitle, &selectedzones); //HTMLParser:: select zones using codestring regex system
title = selectedzoneText(&chunks, &selectedzones); //select just the title and malloc it
selectedzones.clear();
//find all relevant text (including summaries again probably)
//this is to construct a final document
//selectedzones will be consecutive e.g. 1-3, 4-7, 10-45 no overlap
//there are all concatenated togther to form a final document
selectChunks(codestring, &fArticle, &selectedzones); //HTMLParser:: select zones using codestring regex system
//reuse the body of the IR because the result will be smaller
/*
if (newbody = selectedzonesToHTML(&chunks, &selectedzones, (char*)body, bodysize)) //HTMLParser:: concatenates all selected zones together
if (searchForEntities(newbody, &entityids)) //Parser:: returns number of discovered entities
objects->push_back( //add this to the list of objects to save to the DB
//Article strdup's title and summary, but no the body
new Article(m_db, ir, entityids, title, newbody)
); //object frees body when finished with it
*/
free((void*)codestring);
free((void*)title);
//write a new HTML document with all debug in
//caller responsible for freeing newbody
//const char *debug = debugbody(&chunks, &selectedzones, body, bodysize);
//free((void*)debug);
return objects->size();
}