#ifndef _HTMLPARSER_H #define _HTMLPARSER_H class Domain; #include "Parser.h" #include "extensions.h" #include "StringMap.h" #include "FilterGroup.h" #include "HTMLTag.h" #include "HTMLEntity.h" #include "HTMLAttribute.h" #include #include using namespace std; class HTMLParser: public Parser { public: struct cssinstance { HTMLTag *t; //these are all pointers into the document //the strings are not copied or zero terminated //their start and finishes are simply remembered (8 bytes per string) const char *idStart; const char *idFinish; const char *classnameStart; const char *classnameFinish; const char *styleStart; const char *styleFinish; }; //exceptions class BodyOverflow {}; protected: //---------------------------------------------------- text chunk analysis classes enum chunktype { chunk_none = 0, chunk_text, //text only (bold, italic, maybe a heading as well) chunk_image, //image src chunk_alt, //alt details chunk_description, //META content chunk_keywords, //META content chunk_title //Page title text }; //attribute combinations enum tagmodifier { noTagModifer = 0, metaDescription, metaKeywords }; //a TextChunk has NO HTML markup. Just text. struct TextChunk { chunktype m_type; //text/image src/img alt/meta content const char *m_start; //references in source document const char *m_finish; //not zero terminated //statistical properties size_t m_bytes; unsigned int m_wordCount; unsigned int m_alphanumerics; unsigned int m_grammars; unsigned int m_sentences; //structure tags HTMLTag *m_headingTag; //h1 - 6 or font-br, b-br combos const char *m_hrefStart; //start finish pair: the string is not copied, just referenced in the document const char *m_hrefFinish; //not zero terminated //placement info size_t m_markupAreaID; //the associated continuous in-line markup area size_t m_depth; //tag depth vector m_csspath; //*copy* of css instances vector //calculated density stats size_t textLength() const {return m_finish - m_start;} PERCENTAGE textDensity() const {return m_bytes ? textLength() * 100 / m_bytes : 0;} unsigned int wordDensity() const {return m_bytes ? m_wordCount * 1000 / m_bytes : 0;} PERCENTAGE a1Density() const {return m_bytes ? m_alphanumerics * 100 / m_bytes : 0;} PERCENTAGE grammarDensity() const {return m_wordCount ? m_grammars * 100 / m_wordCount : 0;} }; struct selectedzone { size_t firstChunk; //the index of the first TextChunk size_t lastChunk; //the index of the last TextChunk }; const short m_minimumByteDensity; HTMLParser(const int _pagegroupid, TIPsDatabase *_db, const Domain *_domain, const int _type, short _minimumByteDensity = 50); //all re-entrant and thread safe as Parsers can be shared bool accepts(const InternetResource *ir) const; char *chunkcode(const TextChunk &tc, char *pos) const; const size_t getTextChunks(const char *body, size_t bodysize, vector *chunks) const; const size_t calcAggregates(vector *chunks, unsigned int &totalWordCount, unsigned int &totalChunksSize, unsigned int &totalAlphaNumerics, unsigned int &totalBytesSize, unsigned int &maxDepth, const HTMLTag **maxHeadingTag, unsigned int &numHeadings, PERCENTAGE &avgTextHTMLDensity, PERCENTAGE &avgAlphaNumeric, unsigned int &avgWordDensity) const; const size_t generateCodeString(const vector *chunks, char **rcodestring) const; const size_t selectChunks(const char *codestring, Filter *f, vector *v) const; const char *selectedzoneText( const vector *chunks, const vector *selectedzones) const; //caller frees result const char *selectedzonesToHTML(const vector *chunks, const vector *selectedzones, char *newbody = 0, const size_t size = 0) const; //caller frees result if newbody = 0 const char *debugbody(vector *chunks, vector *selectedzones, const char *body, const size_t bodysize) const; }; #endif