#include "InternetURIRequest.h"

//parsing a URL into parts
//capture groups: all:\0 protocol:\1 domain:\2 port:\3 href:\4 folder:\5 file:\6 extension:\7 query:\8
//the pattern is written as adjacent string literals so that no stray whitespace ends up inside the expression
Filter InternetURIRequest::m_fParseAbsoluteToParts(1, 0,
  "^([^:]*)://"                    //\1 protocol
  "([^/\\\\:]*)"                   //\2 domain
  "((?::[0-9]+)?)"                 //\3 port (includes the leading colon when present)
  "[\\\\/]?"                       //optional separator between authority and href
  "("                              //\4 href = folder + file + query
  "((?:[^?]*/)?)"                  //\5 folder
  "([^?]*\\.([^?.]{1,5})|[^?]*)"   //\6 file, \7 extension (1 to 5 characters after the last dot)
  "((?:\\?.*)?)"                   //\8 query (includes the leading ? when present)
  ")"
  "$");

//Constructs an unparsed request: copies the absolute URL and zeroes every URL part.
//  _domain:      domain that this request belongs to; parse() requires the URL's domain to equal _domain->m_domain.
//                Must be non-null when _sweep is 0 because the sweep is then read from it.
//  _absoluteURL: URL text, copied with strdupCheck()
//  _sweep:       sweep number; 0 means "use _domain->sweep()"
InternetURIRequest::InternetURIRequest(Domain *_domain, const char *_absoluteURL, unsigned int _sweep):
  m_domain(_domain),
  m_absoluteURL(strdupCheck(_absoluteURL)), //freed in the destructor
  //malloc'd in parse() and freed in the destructor
  m_protocolText(0),
  m_port(0),
  m_href(0),
  m_folder(0),
  m_file(0),
  m_extension(0),
  m_query(0),
  m_contentType(&ContentType::notStatedContentType),
  m_protocol(notStatedProtocol),
  m_ir(0),
  //read the parameter, not m_domain, so that the result does not depend on member declaration order
  m_sweep(_sweep ? _sweep : _domain->sweep()),
  m_processed(false),
  m_isRepeating(false)
{
  reserve(HTMLPAGE_NEWLINKS); //child requests (and their resource)
}

//Appends a (name, value) pair to m_parameters.
void InternetURIRequest::addParameter(const char *name, const char *value) {
  m_parameters.push_back(make_pair(name, value));
}

//Splits m_absoluteURL into its parts and derives the protocol and content type.
//Throws:
//  InvalidAbsoluteURL when the URL does not match m_fParseAbsoluteToParts
//  DomainMismatch     when the URL's domain differs from m_domain->m_domain (or either is missing)
//  URLParsingFailure  (_DEBUG only) when the re-assembled parts do not equal the original URL
//After a throw this object still owns whatever parts were assigned; the destructor frees them.
void InternetURIRequest::parse() {
  //parse URL
  //the results of the Filter are malloc'd thus do not need to be freed until destruction
  //parameter parts for incomplete urls (all: \0 protocol:\1 domain:\2 port:\3 href:\4 folder:\5 file:\6 extension:\7 query:\8)
  //NOTE(review): element type reconstructed from usage (assigned to const char*); confirm against Filter::submatches()
  vector<const char*> parts;
  parts.reserve(9); //for holding the parsing of a url (+ the 0 match)
  const char *parts_domain = 0, *parts_whole = 0; //these are freed below, not in the destructor

  //printable domain name for log messages: never dereferences a null m_domain and never hands null to %s
  const char *domainName = (m_domain && m_domain->m_domain) ? m_domain->m_domain : "(null)";

  if (!m_fParseAbsoluteToParts.submatches(m_absoluteURL, &parts)) {
    //did not match at all!
    DEBUGERROR("[%s]: URL could not be parsed [%s]", domainName, m_absoluteURL);
    throw InvalidAbsoluteURL();
  }

  m_protocolText = parts[part_protocol];
  m_port         = parts[part_port];
  m_href         = parts[part_href];
  m_folder       = parts[part_folder];
  m_file         = parts[part_file];
  m_extension    = parts[part_extension];
  m_query        = parts[part_query];
  //temporary stack variables: not used in the class (freed below)
  parts_whole    = parts[part_whole];
  parts_domain   = parts[part_domain];

#ifdef _DEBUG
  if (!parts_whole || !parts_domain || !m_protocolText || !m_port || !m_href || !m_folder || !m_file || !m_extension || !m_query) {
    DEBUGERROR("[%s]: empty parts", domainName);
    exit(1);
  }
#endif

  //additional check that the domains match (requirement of the class)
  //a missing parsed domain is treated as a mismatch so that _STRCMP never receives a null pointer
  if (!parts_domain || !m_domain || !m_domain->m_domain || _STRCMP(parts_domain, m_domain->m_domain)) {
    //clear up temporary local stack variables and throw
    //this object still needs to be deleted to clear up the class malloc's
    //this* is trapped and destroyed by Domain (Domain is the only class allowed to create these)
    DEBUGPRINT("[%s]: domains do not match (class requirement) [%s] != [%s]", DEBUG_LINE, domainName, domainName, parts_domain ? parts_domain : "(null)");
    free((void*)parts_whole);
    free((void*)parts_domain);
    throw DomainMismatch();
  }

  //not needed because referenced with domain and absoluteURL
  free((void*)parts_whole);
  free((void*)parts_domain);

  //protocol enum
  if (m_protocolText && *m_protocolText) {
    if      (!_STRCASECMP(m_protocolText, "https")) m_protocol = https;
    else if (!_STRCASECMP(m_protocolText, "http"))  m_protocol = http;
    else if (!_STRCASECMP(m_protocolText, "ftp"))   m_protocol = ftp;
    else {
      m_protocol = unknownProtocol;
      m_domain->addUnknownProtocol(m_protocolText);
    }
  }

  //MIMEType enum
  if (m_extension && *m_extension) {
    m_contentType = ContentType::fromextension(m_extension);
    if (!m_contentType) {
      m_contentType = &ContentType::unknownContentType;
      m_domain->addUnknownFileExtension(m_extension);
    }
  }

#ifdef _DEBUG
  //show implied content type
  DEBUGPRINT("[%s]: [%s](%s)=[%s]", DEBUG_LINE, m_domain->m_domain, m_absoluteURL, m_extension, m_contentType->MIMETypeText());

  //check that the sum of the parts equals the original URL
  char URLparts1[2048], URLparts2[2048];
  _SNPRINTF(URLparts1, sizeof(URLparts1) - 1, "%s://%s%s/%s%s%s", m_protocolText, m_domain->m_domain, m_port, m_folder, m_file, m_query);
  _SNPRINTF(URLparts2, sizeof(URLparts2) - 1, "%s://%s%s", m_protocolText, m_domain->m_domain, m_port);
  //some snprintf variants do not terminate the buffer on truncation: terminate explicitly before comparing
  URLparts1[sizeof(URLparts1) - 1] = '\0';
  URLparts2[sizeof(URLparts2) - 1] = '\0';
  if (_STRCMP(URLparts1, m_absoluteURL) && _STRCMP(URLparts2, m_absoluteURL)) {
    DEBUGERROR("[%s]: URL parsing failure: [%s] != [%s] || [%s]", m_domain->m_domain, m_absoluteURL, URLparts1, URLparts2);
    throw URLParsingFailure();
  }
#endif
}

//Frees the URL copy and every part assigned by parse().
//NOTE(review): m_ir is not released here (the call below is commented out) - confirm who owns it
InternetURIRequest::~InternetURIRequest() {
  //releaseInternetResource();
  //free() accepts a null pointer, so parts that were never assigned need no check
  free((void*)m_absoluteURL);
  free((void*)m_protocolText);
  free((void*)m_port);
  free((void*)m_href);
  free((void*)m_folder);
  free((void*)m_file);
  free((void*)m_extension);
  free((void*)m_query);
}

//Deletes the attached resource (if any) and resets m_ir to 0.
void InternetURIRequest::releaseInternetResource() {
  if (m_ir) {
    delete m_ir;
    m_ir = 0;
  }
}

//Links this request to _parent in both directions unless _parent is already one of m_parents.
//Returns true when a new link was added, false when _parent is null or already linked.
const bool InternetURIRequest::addRelationship(InternetURIRequest *_parent) {
  if (!_parent) return false;

  //check that the relationship does not already exist
  for (vector<InternetURIRequest*>::iterator i = m_parents.begin(); i != m_parents.end(); ++i) {
    if (*i == _parent) return false;
  }

  m_parents.push_back(_parent); //add parent
  _parent->push_back(this);     //add child
  return true;
}

//Searches the ancestors of this request for one whose resource equals() the resource of checkIRQ.
//  checkIRQ:    request to compare against; it is skipped when it appears among the ancestors
//  checkLevels: number of further ancestor levels to search beyond the direct parents
//Returns true when a match is found.
const bool InternetURIRequest::isRepeatingParents(const InternetURIRequest *checkIRQ, const int checkLevels) const {
  if (!checkIRQ) return false;

  for (vector<InternetURIRequest*>::const_iterator iParent = m_parents.begin(); iParent != m_parents.end(); ++iParent) {
    InternetURIRequest *parent = *iParent;
    if (!parent || parent == checkIRQ) continue;

    //found a match
    if (parent->m_ir && parent->m_ir->equals(checkIRQ->m_ir)) return true;
    //found a match in parents (recursive)
    //"> 0" keeps the recursion bounded for a negative checkLevels as well
    if (checkLevels > 0 && parent->isRepeatingParents(checkIRQ, checkLevels - 1)) return true;
  }
  return false;
}

//Decides whether this request is a repeat and caches the answer in m_isRepeating.
//When the answer is true the request is reported through m_domain->addRepeater().
//NOTE(review): addRepeater() runs on every call that returns true, not only on first detection - confirm it tolerates that
const bool InternetURIRequest::isRepeating() {
  //check to see if the number of query string vars is more than x ?fontsize=large&fontsize=small&fontsize=medium&fontsize=half& ...
  if (!m_isRepeating && m_query) {
    unsigned int count = 0;
    for (const char *pos = m_query; *pos; ++pos) {
      if (*pos == '&') count++;
    }
    if (count >= SPIDER_IPR_QUERYCOUNT) m_isRepeating = true;
  }

  //check to see if any of the ancestor pages (x removed) are equal to this
  if (!m_isRepeating && m_ir) m_isRepeating = isRepeatingParents(this, SPIDER_IPR_PARENTSEARCH);

  //add repeater knowledge to the domain
  if (m_isRepeating) m_domain->addRepeater(this);

  return m_isRepeating;
}