#include "HTMLArticleParser.h"
#include "InternetResource.h"
#include "TIPsDatabase.h"

// Chunk-selection filters. Each one carries a regular expression that is
// matched (via selectChunks) against the document's "code string", i.e. the
// letter-code representation of the chunk sequence produced by
// generateCodeString (e.g. H1T4Il = heading 1, Text > 40 words, Linked Image).
// The quoted description in each definition states what the filter selects.

// Article body: one or more headings followed by valid text runs.
Filter HTMLArticleParser::fArticle(0, 0, "(H[0-9])+[^H]*V[0-9]a?(V[0-9]a?|T[0-9]a?|Ia?|Aa?)*", "[\\0]", 0, "plain headings with valid texts", regex::NOCASE, false);
// Page title(s).
Filter HTMLArticleParser::fTitle(0, 0, "Z[^A-Z ]*", "[\\0]", 0, "page title(s)", regex::NOCASE, false);
// Page description(s).
Filter HTMLArticleParser::fDescription(0, 0, "D[^A-Z ]*", "[\\0]", 0, "page description(s)", regex::NOCASE, false);
// Page tags / keywords.
Filter HTMLArticleParser::fKeywords(0, 0, "K[^A-Z ]*", "[\\0]", 0, "page tags(s)", regex::NOCASE, false);

// Constructs the parser; every argument is forwarded unchanged to HTMLParser.
HTMLArticleParser::HTMLArticleParser(const int _pagegroupid, TIPsDatabase *_db, const Domain *_domain, const int _type, short _minimumByteDensity):
  HTMLParser(_pagegroupid, _db, _domain, _type, _minimumByteDensity)
{}

// Returns the parser-type identifier for this class.
const int HTMLArticleParser::parsertype() const {return DBF_PARSERTYPE_HTMLARTICLEPARSER;}

// Parses the HTML body of an internet resource and, when the extracted article
// text contains at least one known entity and its MD5 is not already in
// m_md5s, appends a newly allocated Article to *objects.
//
// ir      - the resource to parse; its body(), size(), domain() and
//           internetURIRequest() are read.
// objects - output list; at most one Article is appended per call.
//
// Returns objects->size() after processing, i.e. the total number of entries
// in *objects, not only the ones added by this call.
// In _DEBUG builds throws DomainMismatch when the resource's domain differs
// from m_domain.
//
// NOTE(review): the element types of the vectors used here are not spelled out
// in this copy of the file -- confirm them against HTMLArticleParser.h.
const size_t HTMLArticleParser::parse(const InternetResource *ir, vector *objects) const {
  //this parsing process requires many diverse custom rules
  //that take effect at different stages
  //thus there are several parsing techniques employed all over the place
  const char *body = ir->body();
  size_t bodysize  = ir->size();
  //a reported size of zero means "unknown": measure the body ourselves,
  //but never hand a null pointer to strlen
  if (!bodysize && body) bodysize = strlen(body);

  #ifdef _DEBUG
  if (m_domain != ir->internetURIRequest()->domain()) {
    DEBUGERROR0("[HTMLArticleParser]: DomainMismatch()");
    throw DomainMismatch();
  }
  #endif

  //defensive: nothing can be parsed without a body
  //(body() is not shown to guarantee a non-null result)
  if (!body) return objects->size();

  //chunks of stuff
  vector chunks;        chunks.reserve(bodysize / 100);
  vector selectedzones; selectedzones.reserve(bodysize / 50000 + 10);
  vector entityids;     entityids.reserve(20);
  vector tags;          tags.reserve(10);

  //every heap pointer starts out null so that the clean-up section at the end
  //is safe even if one of the helpers leaves its output untouched
  char *codestring        = 0; //represents the document in letter codes e.g. H1T4Il = heading 1, Text > 40 words, Linked Image
  const char *newbody     = 0; //reconstructed resultant body taken from the document
  const char *title       = 0; //title of the document
  const char *description = 0; //META description
  const char *keywords    = 0; //META Keywords
  char newbodyMD5[33];         //md5 of the body of the IR

  //document level averages, counts and maximums
  /*
  const HTMLTag *maxHeadingTag;
  unsigned int numHeadings;
  unsigned int totalWordCount, totalChunksSize, totalAlphaNumerics, totalBytesSize, maxDepth;
  PERCENTAGE avgTextHTMLDensity;
  PERCENTAGE avgAlphaNumeric;
  unsigned int avgWordDensity;
  */

  //processing path:
  //parse the HTML into text only chunks and also related in-line continuous markup/text areas
  //and calculate text-chunk level statistics
  //this is the only function that accesses the body,
  //the rest works off the text chunks and areas
  getTextChunks(body, bodysize, &chunks); //output: chunks, markupareas, chunkformats

  //calculate document averages over all text chunks and HTML in-line areas
  /*
  calcAggregates(&chunks,
    //(passed by reference by the function definition)
    totalWordCount, totalChunksSize, totalAlphaNumerics, totalBytesSize, maxDepth,
    &maxHeadingTag, numHeadings,
    avgTextHTMLDensity, avgAlphaNumeric, avgWordDensity
  );
  */

  //Generate the encoded string representation of the chunk types e.g. H1T4Il = heading 1, Text > 40 words, Linked Image
  //HTMLParser:: output: codestring (regex chunk selection system)
  generateCodeString(&chunks, &codestring);

  //title, description and keywords
  //HTMLParser:: select zones using codestring regex system, select and malloc it
  //do this here because the body will get overwritten later
  //selectedzones is emptied after each selection so the next filter starts clean
  selectChunks(codestring, &fTitle, &selectedzones);
  title = selectedzoneText(&chunks, &selectedzones); //example title
  selectedzones.clear();

  selectChunks(codestring, &fDescription, &selectedzones);
  description = selectedzoneText(&chunks, &selectedzones);
  selectedzones.clear();

  selectChunks(codestring, &fKeywords, &selectedzones);
  keywords = selectedzoneText(&chunks, &selectedzones);
  selectedzones.clear();

  //find all relevant text (including summaries again probably)
  //this is to construct a final document
  //selectedzones will be consecutive e.g. 1-3, 4-7, 10-45 no overlap
  //these are all concatenated together to form a final document
  //HTMLParser:: select zones using codestring regex system
  selectChunks(codestring, &fArticle, &selectedzones);

  //HTMLParser:: concatenates all selected zones together
  //could reuse the body of the IR because the result will be smaller but want the original html also
  //thus ask the function to malloc a new area
  newbody = selectedzonesToHTML(&chunks, &selectedzones);
  if (newbody != 0) {
    selectedzones.clear();

    if (searchForEntities(newbody, &entityids)) { //Parser:: returns number of discovered entities
      //now we know we have a document with company names in so do all the time consuming stuff:
      //regex -> tag normalisation
      if (keywords) split(keywords, &tags); //tags: vector: blah, blah, and another

      //the following is handled by the database now (part of the whole new categorisation scheme)
      //searchForTagPhrases(newbody, &tags); //tags: vector: example tag, another tag, taggy, tag, tag

      md5(newbody, newbodyMD5);
      if (m_md5s.find(newbodyMD5) == m_md5s.end()) {
        //add this to the list of objects to save to the DB
        //Article strdup's all inputs
        //object frees all its own mallocs in ~Article()
        //entityids and tags vectors are copied by value
        objects->push_back(
          new Article(m_db, ir, entityids, title, newbody, body, tags, description)
        );
      } else {
        //the md5 was found in the list of current MD5s from the DB
        //so ignore this document
        DEBUGPRINT("[%s]: MD5 hit (%s)", DEBUG_LINE, ir->domain(), newbodyMD5);
        ir->internetURIRequest()->domain()->addMD5Hit(ir->internetURIRequest());
      }
    }
  }

  //free up
  if (newbody)     free((void*)newbody);
  if (title)       free((void*)title);
  if (description) free((void*)description);
  if (keywords)    free((void*)keywords);
  if (codestring)  free((void*)codestring);

  //write a new HTML document with all debug in
  //caller responsible for freeing newbody
  //const char *debug = debugbody(&chunks, &selectedzones, body, bodysize);
  //free((void*)debug);

  return objects->size();
}