#include "Parser.h"
#include "TIPsDatabase.h"
#include "InternetResource.h"
// NOTE(review): the reviewed text had a bare `#include` with no header name
// here. <algorithm> (std::find) and <cstring> (strncpy/strchr) are what this
// file visibly uses - confirm against the original.
#include <algorithm>
#include <cstring>

using namespace std;

// NOTE(review): some template argument lists appear to have been lost from the
// reviewed text: the bare `vector` parameter types of split(),
// searchForTagPhrases() and searchForEntities(), and the target type of the
// dynamic_cast in searchForEntities(). They are reproduced exactly as found;
// restore them from the declarations in Parser.h.

namespace {

// Locks a pthread mutex on construction and unlocks it on destruction, so the
// mutex is released on every exit path (early return or exception).
class ScopedMutexLock {
public:
    explicit ScopedMutexLock(pthread_mutex_t *mutex): m_mutex(mutex) { pthread_mutex_lock(m_mutex); }
    ~ScopedMutexLock() { pthread_mutex_unlock(m_mutex); }

private:
    ScopedMutexLock(const ScopedMutexLock&);            //not copyable
    ScopedMutexLock& operator=(const ScopedMutexLock&); //not copyable
    pthread_mutex_t *m_mutex;
};

} //anonymous namespace

//class-wide state shared by every Parser instance
TIPsDatabase *Parser::m_db = 0;
bool Parser::m_init = false;
StringMultiMapCI Parser::m_entities;
StringMap Parser::m_salutations;
pthread_mutex_t Parser::m_hFirst_mutex = PTHREAD_MUTEX_INITIALIZER;
FilterGroup Parser::m_tagfilters;
StringMap Parser::m_md5s;

// Stores the domain, page group id and type on the instance.
// Note that _db is written to the static m_db, so it replaces the database
// pointer used by all Parser instances, not just this one.
Parser::Parser(const int _pagegroupid, TIPsDatabase *_db, const Domain *_domain, const int _type):
    m_domain(_domain),
    m_pagegroupid(_pagegroupid),
    m_type(_type)
{
    m_db = _db;
}

// Loads the shared entity, md5, salutation and tag filter tables once.
// The first caller performs the load; later callers return immediately.
// Always returns 0.
//
// The mutex is held for the whole load and m_init is only set once every table
// is populated. Previously m_init was set and the mutex released before
// loading, so a concurrent caller could return and read the tables while they
// were still being filled. The load calls must therefore not call init()
// themselves (the mutex is not recursive).
size_t Parser::init() {
    ScopedMutexLock lock(&m_hFirst_mutex);
    if (m_init) return 0; //tables already loaded

    MSG0("[Parser]: Loading entity maps");
    m_db->loadEntities(&m_entities); //load companies, countries, lakes, streets, etc.
    MSG0("[TIPsDatabase]: Loading md5 article map");
    m_db->loadMD5s(&m_md5s);

    //case sensitive salutations (preceded by a word break)
    DEBUGPRINT0("[Parser]: Loading salutation maps", DEBUG_LINE);
    static const char *const salutations[] = { "Mr ", "Mr.", "Mrs ", "Mrs.", "Ms ", "Ms." };
    for (size_t s = 0; s < sizeof(salutations) / sizeof(salutations[0]); ++s) {
        m_salutations.insert(make_pair(salutations[s], true));
    }

    //load tag filter relations
    DEBUGPRINT0("[Parser]: Loading tag filters", DEBUG_LINE);
    m_db->loadTagFilters(&m_tagfilters);

    m_init = true; //publish only after everything is loaded
    return 0;
}

// Splits string on delimiter and appends one entry to v per piece, with leading
// characters <= ' ' skipped on each piece. Empty pieces are appended too.
// Trailing whitespace is not trimmed.
//
// malloc == true:  each entry is a copy made with strdupCheck(); the caller
//                  frees the new contents of v. The input is modified only
//                  temporarily and is restored before returning.
// malloc == false: each entry points INTO string, and every delimiter in string
//                  is permanently overwritten with a NUL terminator. The buffer
//                  must therefore be writable and outlive v.
//
// A null string appends nothing. Always returns 0 (not the number of pieces).
const size_t Parser::split(const char *string, vector *v, char delimiter, const bool malloc) const {
    if (!string) return 0;

    char *cursor = const_cast<char*>(string); //pieces are terminated in place
    const char *start = string;               //start of the current piece
    char c;

    do {
        c = *cursor;
        if (!c || c == delimiter) {
            //terminate the piece; the final piece is already terminated, so
            //do not write to the buffer for it
            if (c) *cursor = 0;

            //skip initial whitespace; compare as unsigned so that UTF-8 bytes
            //(>= 0x80, negative where char is signed) are not taken for whitespace
            while (*start && static_cast<unsigned char>(*start) <= ' ' && start < cursor) start++;

            v->push_back(malloc ? strdupCheck(start) : start);

            if (malloc && c) *cursor = c; //copies made: put the delimiter back
            start = cursor + 1;
        }
        cursor++;
    } while (c);

    return 0;
}

// Runs every tag filter against newbody and appends the description of each
// matching filter to tags (the tag is stored in the description member of the
// Filter).
//
// malloc == true:  entries are copies made with strdupCheck(); the caller frees
//                  the new contents of tags.
// malloc == false: entries are the pointers returned by Filter::description().
//
// Returns the total size of tags afterwards, including any entries that were
// already present.
const size_t Parser::searchForTagPhrases(const char *newbody, vector *tags, const bool malloc) const {
    for (FilterGroup::const_iterator it = m_tagfilters.begin(); it != m_tagfilters.end(); ++it) {
        Filter *filter = it->second;
        if (!filter->match(newbody)) continue;
        tags->push_back(malloc ? strdupCheck(filter->description()) : filter->description());
    }
    return tags->size();
}

// Scans newbody (which may contain HTML) word by word and appends to entityids
// the id of each known entity whose name is found at a capitalised word and
// which passes the dynamic_cast check below. An id is only added if it is not
// already in entityids. The caller manages everything.
//
// After a match, or after a salutation, following capitalised words are skipped
// until a lowercase word or a grammar character is seen.
//
// A null newbody adds nothing. Returns the total size of entityids afterwards.
const size_t Parser::searchForEntities(const char *newbody, vector *entityids) const {
    if (!newbody) return entityids->size();

    const char *pos = newbody, *wordstart = 0;
    bool isname = false;
    //must start false: it is read at the end of the first capitalised word,
    //before anything else has assigned it
    bool jumpConsecutiveNames = false;
    bool firstWordInSentence = true; //starts off as sentence beginning (tracked, not yet consulted)
    char c, was = 0;                 //was == 0 makes the first character a word start boundary
    char firstName[256];             //using the stack for efficiency (we know that the word is going to be short)
    const StringMultiMapCI::const_iterator icoEnd = m_entities.end();
    const StringMap::const_iterator isaEnd = m_salutations.end();

    //traverse document (has HTML)
    while ((c = *pos) != 0) {
        if (isWordStart(c, was)) { //word start boundary (UTF-8 ready)
            wordstart = pos;       //mark word start
            isname = isUpper(c);   //current wordstart is a name
        }
        else if (isWordEnd(c, was) && wordstart) { //word end boundary (UTF-8 ready)
            if (!isname) {
                //lowercase normal word
                jumpConsecutiveNames = false;
            }
            else if (!jumpConsecutiveNames) {
                //we have a name (maybe jump it)
                const size_t len = pos - wordstart;
                if (len < sizeof(firstName) - 1) {
                    //create zero terminated copy of the first word
                    strncpy(firstName, wordstart, len);
                    firstName[len] = 0;

                    //search for entities filed under this first word
                    //NOTE(review): find() on a multimap is not required to return
                    //the first of several equal keys - confirm the intended
                    //"length desc order" scan holds on the target standard library
                    StringMultiMapCI::const_iterator icoCurrent = m_entities.find(firstName);
                    if (icoCurrent != icoEnd) {
                        //cycle through the entities with this first word in length desc order
                        for (; icoCurrent != icoEnd && !_STRCASECMP(icoCurrent->first, firstName); ++icoCurrent) {
                            DBEntity *entity = icoCurrent->second;
                            //compare current entity (only doing companies at the moment)
                            if (!entity->commonword() && strlicmp(wordstart, entity->name())) {
                                if (dynamic_cast(entity) != 0) {
                                    const unsigned int entityid = entity->id();
                                    //add the id if not there already
                                    if (find(entityids->begin(), entityids->end(), entityid) == entityids->end())
                                        entityids->push_back(entityid);
                                }
                                jumpConsecutiveNames = true;
                                break; //don't compare any shorter ones
                            }
                        }
                    }
                    else if (m_salutations.find(firstName) != isaEnd) {
                        //we have a salutation: skip the names that follow it
                        jumpConsecutiveNames = true;
                    }
                }
            }
            wordstart = 0;               //look for new word
            firstWordInSentence = false; //had at least one word
        }

        //additional settings for the character (independent of word start / end)
        if (isGrammar(c)) jumpConsecutiveNames = false; //comma delimited names for example
        if (isSentenceEnd(was, c)) firstWordInSentence = true;
        if (c == '<') pos = strchr(pos, '>'); //jump all simple tags (but analyse all content)
        was = c;                              //record last character for phrase analysis
        if (!pos) break;                      //unterminated tag: nothing left to scan
        pos++;
    }

    return entityids->size();
}