#ifndef _PROTOCOLS_H #define _PROTOCOLS_H /* Protocols are statefull. They hold information about: headers length of input session current request (with Domain and url) */ #include class Spider; class InternetURIRequest; class InternetResource; class HTTP; class FTP; class HTTPS; class ContentType; #include "define_platform.h" #include "ContentType.h" #include "InternetResource.h" //includes all the base classes (HTMLPage, CSS, Javascript, etc.) #include "InternetURIRequest.h" #include "DomainConnection.h" #include "Parser.h" class Protocol: public DomainConnectionEventSink, protected Stream { public: enum conversationResult { notStarted = 0, //initial state unknownConversationResult, //dunno busy, //DOS exception, //exception that needs throwing globaltimeout, //Protocol timed out waiting for first DC event timeout, //DC timedout during read unknownCharset, //did not recognise the character set (may be valid anyway...) unknownContentencoding, //compression or transfer encoding not recognised or failed contentDecodeFailed, //failed during content decode unknownContenttype, //actual MIME content type unknown (won't have created the IR) noBody, //the finalise functions could not find the document failed, //Exceptions maybe, //something went wrong but there is partial/full data some, //some data was returned but not all ok //normal completion } m_conversationResult; struct PersistentData { char *m_referrer; //where the call came from to get here char *m_PHPsessionID; //PHP between request id char *m_ASPsessionID; //ASP between request id }; protected: //these enums are protected because the Protocol should hide compression and char set complexities from the client //they are global and relevant to all Protocols contentLanguage m_contentLanguage; ContentType *m_contentType; enum charSet { //http://www.iana.org/assignments/character-sets unknownCharSet = 0, notStatedCharSet, us_ascii, iso_8859_1, iso_8859_15, windows_1252, utf_8 } m_charSet; enum compression { unknownCompression = 0, notStatedCompression, noCompression, deflate, gzip } m_compression; enum contentEncoding { unknownContentEncoding = 0, notStatedContentEncoding, noContentEncoding, chunked //HTTP/1.1 specific handled by overridden contentDecode() } m_contentEncoding; enum conversationEndScheme { unknownConversationEndScheme = 0, notStatedConversationEndScheme, chunkedEncoding, closeConnection, contentLength } m_conversationEndScheme; enum status { created, connecting, written, reading, red, reconnecting, waitingForReconnectionAttempt, stopped } m_status; static TIPsDatabase *m_db; //for passing through to InternetResources InternetURIRequest *m_ir; //the InternetURIRequest being processed (may affect how the stream is interpreted) InternetResource **m_resource; //the resultant resource compiled by getResource() call. Can be null DomainConnection *m_dc; //the comms link for the conversation (not necessarily just request - response) const char *m_domain; //cache of the string domain name from m_ir vector m_links; //links found in the Protocol information (e.g. HTTP 302 locations) PersistentData *m_permData; //permanent cross-Protocol persistent information (like HTTP session data) const bool m_manageBuffer; //passed through to the IR char *m_buffer; //buffer to use is passed into the protocol system to fill out const size_t m_buffersize; size_t m_totalBytesTransfered; //Protocol needs to know how much of the buffer is relevant const char *m_chunkedCurrentLabelPos; //for chunked transfer encoding only size_t m_chunkedLastSize; //for chunked transfer encoding only unsigned int m_keepalive; //socket level keep alive system bool m_connectionKeepAlive; //socket level keep alive system //potential finalise parts all called from finalise(). May not be used in all Protocols virtual conversationResult decompress() {return ok;} //compresion like gzip (compression enum) virtual conversationResult contentDecode() {return ok;} //things like chunked transfer encoding (contentEncoding enum) virtual conversationResult charsetDecode() {return ok;} // * -> UTF-8 (charSet enum) virtual conversationResult translate() {return ok;} // * -> english (contentLanguage enum) virtual conversationResult createInternetResource() {return ok;} // **InternetResource (contentType enum) virtual conversationResult updatePersistentData() {return ok;} //session data etc. cross-Protocol //stream events virtual conversationResult finalise(); //calls many other virtual functions to translate the final reply virtual const bool checkForEOS() {return false;} //for checking EOS, can use following function const bool checkForEOS(const char *eos) const; //generic utility function const int chunkSize(const char *chunkLabel, size_t *chunkSize, size_t *labelLength = 0); //xtoi get the chunk size \r\n from the chunk label virtual const char *generateRequest() const = 0; //returns the conversation start sentence virtual void finishConversation(conversationResult _result = ok); //clears the conditional wait so that the Protocol returns pthread_cond_t m_conversation_cond; pthread_mutex_t m_conversation_mutex; //virtual generic inherited match functions (need to be re-implemented in derived classes) Protocol(TIPsDatabase *_db, DomainConnection *_dc, InternetURIRequest *ir, InternetResource **_resource, char *_buffer, const size_t _buffersize, PersistentData *_permData, const bool _manageBuffer = false); friend ostream& operator<<(ostream &o, Protocol &p); //for toString() public: virtual ~Protocol(); //just in case it is overriden to release mem //factory function (works off the url) static Protocol *createProtocol( TIPsDatabase *_db, //for creating the InternetResource DomainConnection *_dc, //for the Protocol to talk to the server on (held by the Spider) InternetURIRequest *_ir, //The URI request (contains breakdown of the URI) InternetResource **m_resource, //the resultant resource compiled by getResource() call char *_buffer, //the buffer to pass to the DC for replies from the server (held by the Spider) const size_t _buffersize, PersistentData *_permData = 0, //a previous protocol instance that may need to sync persistent session data const bool _manageBuffer = false); //passes to the IR that would release buffer in ~InternetResource() virtual const char *description() const {return "Protocol base class";} //Exceptions class UnknownProtocol {}; //DomainConnection interface Event sink. DC calls here when a flag changes //all events MUST either do something else that will: // 1) trigger a new event or, - continue/retry the conversation // 2) finishConversation() - return control to the Spider //Default behaviours: virtual void finishedRead(const int bytes); //continue reading or checkEOS and finish virtual void finishedWrite(const int bytes); //start read virtual void somethingReady() {} //(not used) virtual void timedoutOnRead(); //finish virtual void timedoutOnWrite(); //re-connect and re-try virtual void outOfSyncRead(); //ignore (read and discard) data and continue for next write event virtual void outOfSyncWrite(); //finish virtual void threwException(); //re-try connection and continue virtual void connectionClosedOnRead(const size_t bufspace); //potential standard behaviour: override in derived virtual void connectionClosedOnWrite(); //re-try connection and same write virtual bool outOfBufferSpace(); //finish virtual void completeFailure(); //re-try connection and same write virtual void customException(DomainConnection::DomainConnectionException &e); //custom Exception on event loop thread DomainConnection::DomainConnectionException m_exception; //for passing the exception between threads //stream output events virtual conversationResult getResource(const u_int timeout = DC_TIMEOUT); //fills out the InternetResource** virtual const vector *links() const {return &m_links;} //not copy! }; #endif