#include "HTMLGeneralParser.h"
#include "InternetResource.h"
#include "HTMLPage.h"
#include "TIPsDatabase.h"

// NOTE(review): every `vector` in this file appears without template
// arguments. They look like they were stripped by an extraction step; restore
// them from the class header before compiling.

// Code-string filters used by parse(). Each one is a regular expression that
// is matched against the letter-code representation of the document (see
// generateCodeString() in parse()). The meaning of the remaining constructor
// arguments is defined by Filter and is not visible from this file.

// Linked headings followed by valid texts (currently only referenced from
// commented-out code in parse()).
Filter HTMLGeneralParser::fSummary(
  0, 0,
  "(H[0-9]a)+[^H]*V[0-9]a?(V[0-9]a?|T[0-9]a?|Ia?|Aa?)*",
  "[\\0]", 0,
  "linked headings with valid texts",
  regex::NOCASE, false);

// Plain headings followed by valid texts.
Filter HTMLGeneralParser::fArticle(
  0, 0,
  "(H[0-9])+[^H]*V[0-9]a?(V[0-9]a?|T[0-9]a?|Ia?|Aa?)*",
  "[\\0]", 0,
  "plain headings with valid texts",
  regex::NOCASE, false);

// Page title(s).
Filter HTMLGeneralParser::fTitle(
  0, 0,
  "Z[^A-Z ]*",
  "[\\0]", 0,
  "page title(s)",
  regex::NOCASE, false);

// Forwards every argument unchanged to the HTMLParser base constructor.
HTMLGeneralParser::HTMLGeneralParser(const int _pagegroupid,
                                     TIPsDatabase *_db,
                                     const Domain *_domain,
                                     const int _type,
                                     short _minimumByteDensity):
  HTMLParser(_pagegroupid, _db, _domain, _type, _minimumByteDensity)
{}

// Returns the parser-type constant identifying this parser class.
const int HTMLGeneralParser::parsertype() const {
  return DBF_PARSERTYPE_HTMLGENERALPARSER;
}

// Parses the body of an internet resource.
//
// The body is split into text chunks, the chunks are encoded as a letter-code
// string, and the fTitle / fArticle filters are run over that string to select
// zones. The code that turns the selected zones into Summary / Article objects
// is currently commented out, so the live code adds nothing to `objects`.
//
//   ir      - resource to parse; its body() and size() are read. A size of 0
//             means "unknown" and the length is taken with strlen().
//   objects - list that parsed objects would be appended to.
//
// Returns objects->size().
// In _DEBUG builds throws DomainMismatch when the resource's domain is not
// this parser's domain.
const size_t HTMLGeneralParser::parse(const InternetResource *ir, vector *objects) const {
  //this parsing process requires many diverse custom rules
  //that take effect at different stages
  //thus there are several parsing techniques employed all over the place

#ifdef _DEBUG
  if (m_domain != ir->internetURIRequest()->domain()) {
    DEBUGERROR0("[HTMLGeneralParser]: DomainMismatch()");
    throw DomainMismatch();
  }
#endif

  const char *body = ir->body();
  //nothing to parse without a body; also keeps strlen() away from a null pointer
  if (!body) return objects->size();

  size_t bodysize = ir->size();
  if (!bodysize) bodysize = strlen(body);

  //chunks of stuff
  vector chunks;
  chunks.reserve(bodysize / 100);
  vector selectedzones;
  selectedzones.reserve(bodysize / 50000 + 10);

  //represents the document in letter codes
  //e.g. H1T4Il = heading 1, Text > 40 words, Linked Image
  //initialised so that the free() below is always well defined
  char *codestring = 0;
  //const char *newbody; //reconstructed resultant body taken from the document
  const char *title = 0; //title of the document

  //document level averages, counts and maximums
  /*
  const HTMLTag *maxHeadingTag;
  unsigned int numHeadings;
  unsigned int totalWordCount, totalChunksSize, totalAlphaNumerics, totalBytesSize, maxDepth;
  PERCENTAGE avgTextHTMLDensity;
  PERCENTAGE avgAlphaNumeric;
  unsigned int avgWordDensity;
  */

  //processing path:
  //parse the HTML into text only chunks and also related in-line continuous markup/text areas
  //and calculate text-chunk level statistics
  //this is the only function that accesses the body,
  //the rest works off the text chunks and areas
  getTextChunks(body, bodysize, &chunks); //output: chunks, markupareas, chunkformats

  //calculate document averages over all text chunks and HTML in-line areas
  /*
  calcAggregates(&chunks,
    //(passed by reference by the function definition)
    totalWordCount, totalChunksSize, totalAlphaNumerics, totalBytesSize, maxDepth,
    &maxHeadingTag, numHeadings,
    avgTextHTMLDensity, avgAlphaNumeric, avgWordDensity
  );
  */

  //Generate the encoded string representation of the chunk types
  //e.g. H1T4Il = heading 1, Text > 40 words, Linked Image
  generateCodeString(&chunks, &codestring); //HTMLParser:: output: codestring (regex chunk selection system)

  //find summaries only for future linking in with articles
  //NOT IMPLEMENTED YET
  /*
  selectChunks(codestring, &fSummary, &selectedzones);
  for (vector::const_iterator i = selectedzones.begin(); i != selectedzones.end(); i++) {
    const selectedzone& sz = *i;
    newbody = selectedzoneToHTML(&chunks, sz);
    if (newbody) objects->push_back(new Summary(m_db, ir, "title", newbody)); //object frees body
  }
  selectedzones.clear();
  */

  //title
  selectChunks(codestring, &fTitle, &selectedzones); //HTMLParser:: select zones using codestring regex system
  title = selectedzoneText(&chunks, &selectedzones); //select just the title and malloc it
  selectedzones.clear();

  //find all relevant text (including summaries again probably)
  //this is to construct a final document
  //selectedzones will be consecutive e.g. 1-3, 4-7, 10-45 no overlap
  //they are all concatenated together to form a final document
  selectChunks(codestring, &fArticle, &selectedzones); //HTMLParser:: select zones using codestring regex system

  //reuse the body of the IR because the result will be smaller
  //(entityids is only needed by this disabled code, so it is declared here)
  /*
  vector entityids;
  entityids.reserve(20);
  if (newbody = selectedzonesToHTML(&chunks, &selectedzones, (char*)body, bodysize)) //HTMLParser:: concatenates all selected zones together
    if (searchForEntities(newbody, &entityids)) //Parser:: returns number of discovered entities
      objects->push_back( //add this to the list of objects to save to the DB
        //Article strdup's title and summary, but not the body
        new Article(m_db, ir, entityids, title, newbody)
      ); //object frees body when finished with it
  */

  //free(0) is a no-op, so these are safe even if a helper produced nothing
  free(codestring);
  free(const_cast<char*>(title));

  //write a new HTML document with all debug in
  //caller responsible for freeing newbody
  //const char *debug = debugbody(&chunks, &selectedzones, body, bodysize);
  //free((void*)debug);

  return objects->size();
}