#include "Domain.h"

TIPsDatabase *Domain::m_db = 0;

//file-local helpers
//they are templates so that the container types declared in Domain.h are deduced
//rather than spelled out again in this file
namespace {
  //deletes every key of a statistics map (the mapped values are left untouched)
  template <class StatMap>
  void freeStatKeys(const StatMap &stats) {
    for (typename StatMap::const_iterator it = stats.begin(); it != stats.end(); ++it) delete it->first;
  }

  //returns the request stored under absoluteURL, or 0 when the URL is not in pages
  //(every insert in this file stores a pointer obtained from new, so 0 only means "absent")
  template <class PageMap>
  InternetURIRequest *findExistingPage(PageMap &pages, const char *absoluteURL) {
    typename PageMap::iterator hit = pages.find(absoluteURL);
    return (hit == pages.end()) ? 0 : hit->second;
  }

  //writes one "<got>\t<absolute url>\n" line per page, <got> being '1' for processed pages and '0' otherwise
  template <class PageMap>
  void writePageLines(FILE *out, const PageMap &pages) {
    for (typename PageMap::const_iterator it = pages.begin(); it != pages.end(); ++it) {
      InternetURIRequest *request = it->second;
      fputc(request->processed() ? '1' : '0', out);
      fputc('\t', out);
      fputs(request->absoluteURL(), out);
      fputc('\n', out);
    }
  }
}

//Builds a Domain: copies the strings it is given, initialises both mutexes, registers
//_parser (when non-null) and, when _persistent is set, loads previously saved pages from file.
//_browseAreas is stored as-is and is not freed by the Domain.
Domain::Domain(const int _domainID, const char *_domain, const char *_rootPageTitle, TIPsDatabase *_db, Parser *_parser, const char *_startURL, FilterGroup *_browseAreas, const bool _persistent ):
  ResourceMonitor(0),                              //attach and detach only (see ResourceUseEventSink definition)
  m_domainID(_domainID),
  m_domain(strdupCheck(_domain)),
  m_rootPageTitle(strdupCheck(_rootPageTitle)),
  m_repeaters(0),
  m_md5hits(0),
  m_keepAlive(false),
  m_rootPage(0),
  m_sweep(1),
  m_googleCount(0),                                //initialised by Spider when one first arrives
  m_startURL(strdupCheck(_startURL)),              //in conjunction with m_browseAreas can browse a section of a site
  m_browseAreas(_browseAreas),                     //caller frees these (maybe used in other places)
  m_persistent(_persistent),
  m_pagesSinceLastLoad(0)
{
  m_db = _db; //static: shared by every Domain
  m_ConversationEndSchemes.assign(10,0);
  m_responseProtocols.assign(10,0);
  time(&m_created);
  MEMBER_INIT_MUTEX(m_hPopLink_mutex);
  MEMBER_INIT_MUTEX(m_hPages_mutex);

  //add default Parsers and Indexers (Filter* 0 = whole site) (Parser* 0 = do not parse the area)
  addParser(_parser);

  //attempt load domain from file (if exists)
  if (m_persistent) {
    DEBUGPRINT("[%s]: loading domain from file...", DEBUG_CHECK, m_domain);
    loadDomainFromFile();
    DEBUG_RESULT_OK;
  }

  memoryDelta((int)(sizeof(Domain)), this);
}

//Saves the domain (when persistent), then releases the strings, parsers, filter groups,
//mutexes and statistics keys owned by this object.
Domain::~Domain() {
  if (m_persistent) {
    //saveDomainToFile() throws CannotOpenFile: an exception must never leave a destructor
    try {
      saveDomainToFile();
    } catch (...) {
      //nothing more to do: saveDomainToFile() logs the failure before it throws
    }
  }

  memoryDelta(-(int)sizeof(Domain), this);
  if (m_domain)        {free((char*)m_domain);        m_domain        = 0;}
  if (m_rootPageTitle) {free((char*)m_rootPageTitle); m_rootPageTitle = 0;} //friendly name of the Domain
  if (m_startURL)      {free((char*)m_startURL);      m_startURL      = 0;}

  //Parsers and their FilterGroups (a FilterGroup may be 0: delete on a null pointer is a no-op)
  for (size_t idx = 0; idx < m_parsers.size(); ++idx) {
    delete m_parsers[idx].first;
    delete m_parsers[idx].second;
  }

  //the pages in m_pages (m_rootPage is one of them) are NOT deleted here: the loop that did so is disabled
  //for (StringMap::iterator it = m_pages.begin(); it != m_pages.end(); it++) delete it->second;
  m_rootPage = 0;

  pthread_mutex_destroy(&m_hPopLink_mutex);
  pthread_mutex_destroy(&m_hPages_mutex);

  //free stats strings
  freeStatKeys(m_unknownProtocol);
  freeStatKeys(m_unknownFileExtension);
  freeStatKeys(m_unknownCharSet);
  freeStatKeys(m_unknownContentType);
  freeStatKeys(m_unknownResponseProtocol);
  freeStatKeys(m_unknownLanguage);
  freeStatKeys(m_unknownContentEncoding);
  freeStatKeys(m_unknownCompression);
  freeStatKeys(m_unknownConversationEndScheme);
}

//Calls init() on every registered Parser.
//returns: the number of registered parsers
size_t Domain::initParsers() {
  for (size_t idx = 0; idx < m_parsers.size(); ++idx) {
    Parser *parser = m_parsers[idx].second;
    if (parser) parser->init();
  }
  return m_parsers.size();
}

//Reads "domains<DIRSPLITTER><domain>.txt" and registers every page listed in it.
//Lines starting with '/' (header comments) and blank lines are skipped; on every other line the first
//character is the "got" flag ('0' = still to fetch) and the absolute URL starts at the 3rd character.
//A missing file is not an error.
//returns: the number of pages known to the domain after loading
const size_t Domain::loadDomainFromFile() {
  FILE *fLoad;
  char *filename = (char*)mallocCheck(strlen(m_domain) + 20);
  char *buffer   = (char*)mallocCheck(4096);

  sprintf(filename, "domains%s%s.txt", DIRSPLITTER, m_domain);
  _FOPEN(fLoad, filename, "r" );
  if (fLoad) {
    DEBUGPRINT("[%s]: opened domain save file [%s] for reading", DEBUG_LINE, m_domain, filename);
    pthread_mutex_lock(&m_hPages_mutex);
    //fgets keeps the line's '\n' (when there is one) and always 0 terminates; it returns 0 once nothing more can be read
    while (fgets(buffer, 4096, fLoad)) {
      const char c = *buffer;
      if (c && c != '/' && c != '\n') { //skip comments (header) and blank lines
        //strip the trailing newline only when it is really there:
        //the last line of a file may not have one and must keep its final character
        size_t len = strlen(buffer);
        if (buffer[len - 1] == '\n') buffer[--len] = 0;

        if (len < 3) {
          //too short to hold "<got>\t<url>": buffer + 2 would not point at a URL
          DEBUGERROR("[%s]: Could not parse domain file [%s]", m_domain, buffer);
        } else {
          InternetURIRequest *link = new InternetURIRequest(this, buffer + 2);
          bool parsed = false;
          try {
            link->parse(); //Domain is a friend of Page
            parsed = true;
            m_pages.insert(make_pair(link->absoluteURL(), link));
            if (c == '0') m_newlinks.push_back(link); //new links list
            else          link->setProcessed();       //link done (or ignored or something)
          } catch (...) {
            DEBUGERROR("[%s]: Could not parse domain file [%s]", m_domain, buffer + 2);
            //a link that failed to parse was never stored anywhere: free it, as createPage() does
            if (!parsed) delete link;
          }
        }
      }
    }
    pthread_mutex_unlock(&m_hPages_mutex);
    fclose(fLoad);
  } else {
    DEBUGPRINT("[%s]: cannot open domain save file [%s] for reading (probably doesn't exist)", DEBUG_LINE, m_domain, filename);
  }

  free(filename);
  free(buffer);

  //%u expects an unsigned int, size() returns a size_t
  DEBUGPRINT("[%s]: [%u] pages loaded from file", DEBUG_LINE, m_domain, (unsigned int)m_pages.size());
  m_pagesSinceLastLoad = 0;

  return m_pages.size();
}

//Writes a comment header followed by one line per known page to "domains<DIRSPLITTER><domain>.txt".
//returns: the number of pages known to the domain
//throws:  CannotOpenFile when the file cannot be opened for writing
const size_t Domain::saveDomainToFile() {
  //save to file because we do not want to pass so much information over the net
  //defeats the point of speeding up the page retrieval
  DEBUGPRINT("[%s]: saving domain to file...", DEBUG_LINE, m_domain);
  FILE *fSave;
  char *filename = (char*)mallocCheck(strlen(m_domain) + 20);
  char line[1024];                //generic string space for the output
  time_t t = time(0);             //current time
  struct tm *tmp = localtime(&t); //0 when the time cannot be converted

  sprintf(filename, "domains%s%s.txt", DIRSPLITTER, m_domain);
  _FOPEN(fSave, filename, "w" );
  if (!fSave) {
    DEBUGERROR("[%s]: cannot open file [%s] for writing", m_domain, filename);
    DEBUGERROR("[%s]: CannotOpenFile()", m_domain);
    free(filename);
    throw CannotOpenFile();
  }

  //header
  fputs("//--------------------------------------------------\n", fSave);
  _SNPRINTF(line, sizeof(line), "//Creator: %s v%s.%s build %s (%s)\n", FEEDER_NAME, FEEDER_MAJORVERSION, FEEDER_MINORVERSION, FEEDER_LIVEBUILD, FEEDER_VERSIONNAME);
  fputs(line, fSave);
  //%T is not supported by windows
  //the date line is left out when the time could not be converted or formatted
  if (tmp && strftime(line, sizeof(line), "//Date: %Y-%b-%d %H:%M:%S %z\n", tmp)) fputs(line, fSave);
  fputs( "//Format: 1.0\n", fSave);
  _SNPRINTF(line, sizeof(line), "//Domain: %s\n", m_domain);
  fputs(line, fSave);
  _SNPRINTF(line, sizeof(line), "//URL Count: %u\n", (unsigned int)m_pages.size());
  fputs(line, fSave);
  _SNPRINTF(line, sizeof(line), "//New Count: %u\n", (unsigned int)m_newlinks.size());
  fputs(line, fSave);
  fputs("//--------------------------------------------------\n", fSave);

  //pages (tab delimited strings)
  fputs("//[got]\t[absolute url]\n", fSave);
  pthread_mutex_lock(&m_hPages_mutex);
  writePageLines(fSave, m_pages);
  pthread_mutex_unlock(&m_hPages_mutex);

  fclose(fSave);
  free(filename);

  DEBUGPRINT("[%s]: [%u] pages saved to file", DEBUG_LINE, m_domain, (unsigned int)m_pages.size());
  m_pagesSinceLastLoad = 0;

  return m_pages.size();
}

//Registers parser for the given areas (areas may be 0). A null parser is ignored.
void Domain::addParser(Parser *parser, FilterGroup *areas) {
  //Domain will delete both parser and areas at ~Domain()
  if (parser) m_parsers.push_back(make_pair(areas, parser));
}

//returns: known pages divided by the seconds elapsed since this Domain was created,
//         or 0 while no measurable time has elapsed (avoids a division by zero)
double Domain::pagesPerSecond() const {
  time_t now;
  time(&now);
  const double seconds = difftime(now, m_created);
  if (seconds <= 0) return 0;
  return (m_pages.size() / seconds);
}

//Builds "http://<domain><startURL>", creates (or finds) its page and remembers it as the root page.
//returns: the root page
InternetURIRequest *Domain::createRootPage() {
  size_t len = strlen(m_domain) + strlen(m_startURL) + 9;
  char *rootPageURL = (char*)mallocCheck(len + 1);
  _SNPRINTF(rootPageURL, len, "http://%s%s", m_domain, m_startURL);
  DEBUGPRINT("[%s]: createRootPage", DEBUG_LINE, m_domain);
  m_rootPage = createPage(rootPageURL);
  free(rootPageURL);
  return m_rootPage;
}

//Returns the page for _absoluteURL, creating and registering it when the Domain does not know it yet.
//A new page is queued as a new link when there are no browse areas or when one of them matches the URL,
//and is related to _parent (when given). An existing page is returned untouched.
//_ifModifiedSince is not used here.
//throws: whatever InternetURIRequest::parse() throws; no page is created in that case
InternetURIRequest *Domain::createPage(const char *_absoluteURL, InternetURIRequest *_parent, const char *_ifModifiedSince) {
  //InternetURIRequest copies the _absoluteURL. Caller controls _absoluteURL
  //the lookup + insert happen under m_hPages_mutex to ensure that the URL (link) is unique in the Domain
  //NOTE(review): m_newlinks is appended to here under m_hPages_mutex but popped in popNewPage() under
  //m_hPopLink_mutex, so the two functions do not exclude each other: confirm this is intended
  pthread_mutex_lock(&m_hPages_mutex);
  InternetURIRequest *link = findExistingPage(m_pages, _absoluteURL); //gets very big over time
  if (!link) {
    //not found: create
    link = new InternetURIRequest(this, _absoluteURL);
    //this parse() could throw a DomainMismatch() | InvalidAbsoluteURL(): let it bubble up (no new link will be created)
    try {
      link->parse(); //Domain is a friend of Page
    } catch (...) {
      delete link;
      pthread_mutex_unlock(&m_hPages_mutex);
      throw;
    }
    m_pages.insert(make_pair(link->absoluteURL(), link));
    if (!m_browseAreas || m_browseAreas->matchAny(_absoluteURL)) m_newlinks.push_back(link); //new links list
    //update relationship array(s)
    //we only do this on create new link: we don't want the home page to have loads of parent (everything links to it)
    if (_parent) link->addRelationship(_parent);
  }
  //else found: already exists, link points to the existing Page*
  pthread_mutex_unlock(&m_hPages_mutex);

  return link;
}

//Hands out the next page to fetch.
//*nextPage receives the oldest queued new link, or 0 when the queue is empty (the sweep counter is then incremented).
//returns: newLink when a page was handed out, noWork otherwise
Domain::jobStatus Domain::popNewPage(InternetURIRequest **nextPage) {
  //there may be multiple SPiders on this domain so needs to be a critical section
  jobStatus newJobStatus = noWork;

  pthread_mutex_lock(&m_hPopLink_mutex);
  if (!m_newlinks.empty()) {
    //more new links: take the first one and remove it immediately as this may be multi-threaded
    *nextPage = m_newlinks.front();
    m_newlinks.pop_front();
    newJobStatus = newLink;
  } else {
    //no more new links: domain is finished
    DEBUGPRINT("[%s]: my work here is finished! sweep:[%u]", DEBUG_LINE, m_domain, m_sweep);
    m_sweep++;
    *nextPage = 0;
    newJobStatus = noWork;
  }
  pthread_mutex_unlock(&m_hPopLink_mutex);

  return newJobStatus;
}

//Counts one completed page (lastPage itself is not used) and periodically saves a persistent domain.
//A CannotOpenFile thrown by saveDomainToFile() is passed on to the caller.
void Domain::updateLastPage(InternetURIRequest *lastPage) {
  m_pagesSinceLastLoad++;
  //save this domain to file every DOMAIN_DOMAINSAVEDELTA completed pages (not necessarily successful)
  //saveDomainToFile() contains mutexing around m_pages
  //and also resets m_pagesSinceLastLoad
  if (m_persistent && m_pagesSinceLastLoad % DOMAIN_DOMAINSAVEDELTA == 0) saveDomainToFile();
}

//Counts one more resource in the bucket size/1000.
void Domain::addResourceSize(const size_t size) {
  const size_t size_kb = (size_t)size/1000;
  //operator[] creates a missing bucket with a count of 0 before it is incremented
  ++m_resourceSizes[size_kb];
}

//Counts one more occurrence of httpResponseCode.
void Domain::addHTTPResponseCode(const int httpResponseCode) {
  //operator[] creates a missing counter with a value of 0 before it is incremented
  ++m_HTTPReturnCodes[httpResponseCode];
}

//Counts one more repeater (page itself is not used).
void Domain::addRepeater(const InternetURIRequest *page) {
  m_repeaters++;
}

//Counts one more MD5 hit (page itself is not used).
void Domain::addMD5Hit(const InternetURIRequest *page) {
  m_md5hits++;
}

//returns: the first registered Parser (in registration order) that accepts resource and whose FilterGroup
//         is either 0 or matches the relative HREF of the resource's request; 0 when there is none
//         or when resource is 0
Parser *Domain::parserFor(const InternetResource *resource) const {
  if (!resource) return 0;

  const char *relativeHREF = resource->internetURIRequest()->relativeHREF();
  //traverse the areas list from start to finish
  for (size_t idx = 0; idx < m_parsers.size(); ++idx) {
    FilterGroup *areas  = m_parsers[idx].first;
    Parser      *parser = m_parsers[idx].second;
    //a null FilterGroup or a matching one will return the associated Parser
    if (parser->accepts(resource) && (!areas || areas->matchAny(relativeHREF))) return parser;
  }

  return 0;
}