#ifndef _HTMLPARSER_H
#define _HTMLPARSER_H
class Domain;
#include "Parser.h"
#include "extensions.h"
#include "StringMap.h"
#include "FilterGroup.h"
#include "HTMLTag.h"
#include "HTMLEntity.h"
#include "HTMLAttribute.h"
#include
#include
using namespace std;
//HTMLParser: a Parser subclass that declares the data structures and operations used to
//break an HTML body into markup-free text chunks, gather statistics about them, and pick
//out "selected zones" (runs of chunks). Only declarations are visible in this header.
//NOTE(review): every `vector` below appears without an element type in this copy of the
//header. Presumably they are vector<cssinstance>, vector<TextChunk> and vector<selectedzone>
//as the parameter names suggest - confirm against the implementation file before relying on it.
class HTMLParser: public Parser {
public:
//one tag together with the id / class / style attribute values found on it
struct cssinstance {
HTMLTag *t; //the tag this instance describes
//these are all pointers into the document
//the strings are not copied or zero terminated
//their start and finishes are simply remembered (8 bytes per string)
const char *idStart;
const char *idFinish;
const char *classnameStart;
const char *classnameFinish;
const char *styleStart;
const char *styleFinish;
};
//exceptions
//empty tag type; where it is thrown is not visible in this header
//(presumably when a body exceeds some buffer limit - TODO confirm)
class BodyOverflow {};
//NOTE: everything from here to the end of the class, including the constructor and
//all member functions, is declared under this protected: specifier
protected:
//---------------------------------------------------- text chunk analysis classes
//what kind of content a TextChunk holds
enum chunktype {
chunk_none = 0,
chunk_text, //text only (bold, italic, maybe a heading as well)
chunk_image, //image src
chunk_alt, //alt details
chunk_description, //META content
chunk_keywords, //META content
chunk_title //Page title text
};
//attribute combinations
//(enumerator spelling "noTagModifer" is part of the existing interface; do not rename here)
enum tagmodifier {
noTagModifer = 0,
metaDescription,
metaKeywords
};
//a TextChunk has NO HTML markup. Just text.
//it references a span of the source document (it does not own or copy the text) and
//carries statistics plus placement information for that span
struct TextChunk {
chunktype m_type; //text/image src/img alt/meta content
const char *m_start; //references in source document
const char *m_finish; //not zero terminated
//statistical properties
size_t m_bytes; //denominator of the byte based densities below; presumably the size of the surrounding markup area - TODO confirm
unsigned int m_wordCount;
unsigned int m_alphanumerics;
unsigned int m_grammars;
unsigned int m_sentences;
//structure tags
HTMLTag *m_headingTag; //h1 - 6 or font-br, b-br combos
const char *m_hrefStart; //start finish pair: the string is not copied, just referenced in the document
const char *m_hrefFinish; //not zero terminated
//placement info
size_t m_markupAreaID; //the associated continuous in-line markup area
size_t m_depth; //tag depth
vector m_csspath; //*copy* of css instances vector
//calculated density stats
//all of these are evaluated on unsigned integer operands (so the quotient is truncated)
//and each returns 0 instead of dividing when its denominator is zero
//length of the referenced text: distance from m_start to m_finish
size_t textLength() const {return m_finish - m_start;}
//textLength() as a percentage of m_bytes
PERCENTAGE textDensity() const {return m_bytes ? textLength() * 100 / m_bytes : 0;}
//words per 1000 bytes of m_bytes (note the factor is 1000, not 100)
unsigned int wordDensity() const {return m_bytes ? m_wordCount * 1000 / m_bytes : 0;}
//m_alphanumerics as a percentage of m_bytes
PERCENTAGE a1Density() const {return m_bytes ? m_alphanumerics * 100 / m_bytes : 0;}
//m_grammars as a percentage of m_wordCount
PERCENTAGE grammarDensity() const {return m_wordCount ? m_grammars * 100 / m_wordCount : 0;}
};
//a run of TextChunks identified by index into a chunk vector
//(whether lastChunk is inclusive is not visible here - TODO confirm)
struct selectedzone {
size_t firstChunk; //the index of the first TextChunk
size_t lastChunk; //the index of the last TextChunk
};
//immutable after construction; presumably initialised from the constructor's
//_minimumByteDensity argument (default 50) - the initialiser is not visible here
const short m_minimumByteDensity;
//_minimumByteDensity defaults to 50; the meaning of the other arguments is defined by
//the implementation / the Parser base class, neither of which is visible in this header
HTMLParser(const int _pagegroupid, TIPsDatabase *_db, const Domain *_domain, const int _type, short _minimumByteDensity = 50);
//all re-entrant and thread safe as Parsers can be shared
//(every member function below is declared const)
bool accepts(const InternetResource *ir) const;
//takes a chunk and a char position and returns a char pointer; presumably writes a code
//for the chunk at pos and returns the advanced position - TODO confirm
char *chunkcode(const TextChunk &tc, char *pos) const;
//reads body (bodysize bytes) and fills *chunks; the meaning of the returned size_t is
//not visible here (presumably the number of chunks)
const size_t getTextChunks(const char *body, size_t bodysize, vector *chunks) const;
//reports totals, maxima and averages for *chunks through the reference / pointer out-parameters
const size_t calcAggregates(vector *chunks, unsigned int &totalWordCount, unsigned int &totalChunksSize, unsigned int &totalAlphaNumerics, unsigned int &totalBytesSize, unsigned int &maxDepth, const HTMLTag **maxHeadingTag, unsigned int &numHeadings, PERCENTAGE &avgTextHTMLDensity, PERCENTAGE &avgAlphaNumeric, unsigned int &avgWordDensity) const;
//produces a code string for the chunks and hands it back through *rcodestring;
//ownership of that string is not stated in this header - check the implementation
const size_t generateCodeString(const vector *chunks, char **rcodestring) const;
//applies Filter f to codestring and stores results in *v
//(presumably a vector of selectedzone - the element type is not visible here)
const size_t selectChunks(const char *codestring, Filter *f, vector *v) const;
const char *selectedzoneText( const vector *chunks, const vector *selectedzones) const; //caller frees result
const char *selectedzonesToHTML(const vector *chunks, const vector *selectedzones, char *newbody = 0, const size_t size = 0) const; //caller frees result if newbody = 0
//debugging aid taking the chunks, the selected zones and the original body; the format
//and ownership of the returned string are not visible in this header
const char *debugbody(vector *chunks, vector *selectedzones, const char *body, const size_t bodysize) const;
};
#endif