// Domain.h - declaration of the Domain class: the state kept for one site
// (root page, parser/filter configuration, page lookup and statistics).
//
// NOTE(review): this header appears to have lost every "<...>" span in transit
// (one #include target and all template argument lists). Those places are
// marked below and must be restored from the original file; the declarations
// are otherwise kept exactly as received.
// NOTE(review): names that start with an underscore followed by an uppercase
// letter (such as the include guard below) are reserved for the implementation;
// consider renaming once all users of the guard are known.
#ifndef _DOMAIN_H
#define _DOMAIN_H

class TIPsDatabase;
class InternetResource;
class InternetURIRequest;

#include "FilterGroup.h"
#include "InternetURIRequest.h"
#include "StringMap.h"
#include "TIPsDatabase.h"
#include "extensions.h"
#include "ResourceUseEventSink.h"
#include "Parsers.h"
// NOTE(review): the header name is missing from the next directive as received
// (presumably an angle-bracket system or library header) - restore it.
#include

// Domain holds everything tracked for a single site: identity, the root page
// the page tree hangs off, the parsers and URL filters that apply to it, the
// lookup of pages already found, and the statistics gathered while processing.
// It derives publicly from ResourceMonitor.
class Domain: public ResourceMonitor {
    // Database handle; static, so shared by all Domain objects.
    static TIPsDatabase *m_db;
    const int m_domainID;
    const char *m_rootPageTitle;        // friendly name of the Domain
    InternetURIRequest *m_rootPage;     // the whole page tree hangs off this page
    unsigned int m_sweep;               // number of times the whole domain has been completed during this program run
    unsigned int m_googleCount;         // number of pages in the Domain as reported by Google
    time_t m_created;                   // used to calculate average pages-per-second statistics
    pthread_mutex_t m_hPopLink_mutex;   // guards asking for the next link (multi-threaded critical section)
    pthread_mutex_t m_hPages_mutex;     // guards changes to pages (saving, loading and creating)

    // Custom parsing: a Filter works on the URL and allows Parsers to be
    // attached to the whole site (0) or to areas of the site.
    // The first Filter/Parser that applies is used; they are scanned in normal order.
    // A default (Filter*=0) can be added first if the constructor is given the Parser.
    // NOTE(review): the element type of this vector is missing as received
    // (the stray '>' suggests a nested template such as a pair) - restore it.
    vector > m_parsers;
    const char *m_startURL;             // start place in a site
    FilterGroup *m_browseAreas;         // limits the Spider to area(s) of a site
    const bool m_persistent;            // whether the domain is loaded from and saved to a domain file of URLs
    unsigned int m_pagesSinceLastLoad;  // number of links processed since the last loadFromFile() or instantiation

    // Statistics.
    // NOTE(review): the template arguments of the vector, map and StringMap
    // members in this section are missing as received - restore them.
    vector m_documents;                 // pages that have company names in them
    StringMap m_pages;                  // string-key lookup to check whether a URL has already been found: NEVER DELETE ITEMS (just free the body)
    unsigned int m_repeaters;           // pages suspected of infinite repetition according to isRepeating()
    unsigned int m_md5hits;             // articles that are already in the DB
    map m_HTTPReturnCodes;              // return-code statistics
    map m_resourceSizes;                // page-size statistics
    // Values seen but not recognised, one collection per category.
    StringMap m_unknownProtocol;
    StringMap m_unknownFileExtension;
    StringMap m_unknownCharSet;
    StringMap m_unknownContentType;
    StringMap m_unknownResponseProtocol;
    StringMap m_unknownLanguage;
    StringMap m_unknownContentEncoding;
    StringMap m_unknownCompression;
    StringMap m_unknownConversationEndScheme;
    vector m_ConversationEndSchemes;
    vector m_responseProtocols;
    bool m_keepAlive;                   // set to true by addKeepAlive()

    friend class Page;
    friend class InternetURIRequest;
    // Reports access all parts of all relevant classes.
    friend class Report;
    friend class Report_Full;
    friend class Report_DomainSummary;

public:
    // Result of popNewPage().
    // NOTE(review): the precise meaning of each value is defined by the
    // implementation, which is not visible in this header.
    enum jobStatus { newLink, refreshPage, noWork };

    const char *m_domain;
    // NOTE(review): the element type of this list is missing as received - restore it.
    list m_newlinks;

    // Constructs a Domain. Every argument after _domain has a default value.
    Domain(const int _domainID, const char *_domain, const char *_rootPageTitle = 0, TIPsDatabase *_db = 0, Parser *_Parser = 0, const char *_startURL = "/", FilterGroup *_browseAreas = 0, const bool persistent = false);
    ~Domain();

    // Attaches a parser; areas defaults to 0.
    void addParser(Parser *parser, FilterGroup *areas = 0);
    // Convenience overload: wraps the single Filter in a newly allocated FilterGroup.
    // NOTE(review): who deletes that FilterGroup is not visible here - confirm.
    void addParser(Parser *parser, Filter *area) {addParser(parser, new FilterGroup(area));}
    // Convenience overload: builds a newly allocated Filter from a regular expression.
    // NOTE(review): the meaning of the two leading 1 arguments to Filter is not visible here - confirm.
    void addParser(Parser *parser, const char *regex) {addParser(parser, new Filter(1, 1, regex));}
    size_t initParsers();
    Parser *parserFor(const InternetResource *resource) const;

    // Page management.
    InternetURIRequest *createRootPage();
    InternetURIRequest *createPage(const char *_absoluteURL, InternetURIRequest *_parent = 0, const char *_ifModifiedSince=0);
    // Reports a jobStatus; nextPage is an output pointer.
    jobStatus popNewPage( InternetURIRequest **nextPage);
    void updateLastPage(InternetURIRequest *lastPage);
    // NOTE(review): top-level const on a by-value return type has no effect for
    // callers; it is left in place because the out-of-line definitions must match.
    const size_t saveDomainToFile();
    const size_t loadDomainFromFile();

    // Accessors.
    const int domainID() const {return m_domainID;}
    unsigned int sweep() const {return m_sweep;}
    double pagesPerSecond() const;
    unsigned int googleCount() const {return m_googleCount;}
    // Sets the Google page count only while the stored value is still zero;
    // later calls leave a non-zero value unchanged.
    void googleCount(const unsigned int _googleCount) {if (!m_googleCount) m_googleCount = _googleCount;}

    // Statistics.
    void addHTTPResponseCode(const int httpResponseCode);
    void addRepeater(const InternetURIRequest *page);
    void addMD5Hit(const InternetURIRequest *page);
    // Appends the page to m_documents; a null page is ignored.
    void addDocument(const InternetURIRequest *page) {if (page) m_documents.push_back(page);}
    void addResourceSize(const size_t size);
    // Each addUnknown* method records a copy of the given string, made with
    // _STRDUP, in the matching collection with a value of 0; a null item is ignored.
    // NOTE(review): if StringMap::insert rejects a key that is already present,
    // the _STRDUP copy would presumably be leaked - confirm against StringMap
    // and the definition of _STRDUP.
    void addUnknownProtocol(const char *item) {if (item) m_unknownProtocol.insert( make_pair(_STRDUP(item),0));}
    // Unlike its siblings, this one also ignores an empty string.
    void addUnknownFileExtension(const char *item) {if (item && *item) m_unknownFileExtension.insert(make_pair(_STRDUP(item),0));}
    void addUnknownCharSet(const char *item) {if (item) m_unknownCharSet.insert( make_pair(_STRDUP(item),0));}
    void addUnknownContentType(const char *item) {if (item) m_unknownContentType.insert( make_pair(_STRDUP(item),0));}
    void addUnknownResponseProtocol(const char *item) {if (item) m_unknownResponseProtocol.insert( make_pair(_STRDUP(item),0));}
    void addUnknownLanguage(const char *item) {if (item) m_unknownLanguage.insert( make_pair(_STRDUP(item),0));}
    void addUnknownContentEncoding(const char *item) {if (item) m_unknownContentEncoding.insert( make_pair(_STRDUP(item),0));}
    void addUnknownCompression(const char *item) {if (item) m_unknownCompression.insert( make_pair(_STRDUP(item),0));}
    void addUnknownConversationEndScheme(const char *item) {if (item) m_unknownConversationEndScheme.insert( make_pair(_STRDUP(item),0));}
    // Store c at index c of the corresponding vector.
    // NOTE(review): no bounds check is visible here; confirm that the vectors
    // are sized elsewhere to cover every value of c that can be passed.
    void addConversationEndScheme(size_t c) {m_ConversationEndSchemes[c] = c;}
    void addResponseProtocol(size_t c) {m_responseProtocols[c] = c;}
    void addKeepAlive() {m_keepAlive = true;}

    // Exceptions.
    // Empty tag type; presumably thrown when a domain file cannot be opened - confirm in the implementation.
    class CannotOpenFile {};
};

#endif