/************************************************************************** THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. Copyright 1997 Microsoft Corporation. All Rights Reserved. **************************************************************************/ /************************************************************************** File: ExplorerBar.cpp Description: Implements CExplorerBar **************************************************************************/ /************************************************************************** #include statements **************************************************************************/ #include "ExplrBar.h" #include "Guid.h" //#include "strsafe.h" #include //sprintf #include "Tchar.h" //_stprintf, _tcs* #include ".\explrbar.h" #include "DispEx.h" #include "msxml2.h" /************************************************************************** CExplorerBar::CExplorerBar() **************************************************************************/ StringMapCI CExplorerBar::Tag::m_tags; //HTML tags lookup StringMapCI CExplorerBar::m_entities; //entity lookup ("e; etc.) vector CExplorerBar::m_entity_chars; //entity -> chars StringMapCI CExplorerBar::m_attributes; //attribute lookup //static tag initialisation (enters into a map) //For inline/block we distinguish between tags that are: // 1. used always in text in-line just for in-line formatting (converted to formats in the Document class) // 2. used almost always in text block for layout (converted to \n) // 3. unknown usage (may require CSS analysis to be sure) // 4. everything else: commonly used in unwanted markup //processing instructions CExplorerBar::Tag CExplorerBar::tag_comment("--", PI, JUMP, BLOCK, NCSS, NORM, "-->"); //special jump code, find code etc. (does not require a space after name else { //normal tagname resolution tagEnd = tagStart; while ((c = *tagEnd) && c != '>' && c != '\n' && c != '\r' && c != '/' && c != ' ') tagEnd++; //nearest space or > or / (
) or EOF } if (c) thisTag = Tag::tag(tagStart, tagEnd); //EOF check, we have the tag name now :) return thisTag; } CExplorerBar::attribute CExplorerBar::getattributename(const char *attributeStart) const { const char *attributeEnd; char c; size_t attributeLength; attribute thisAttribute = unknownAttribute; StringMapCI::const_iterator iAttribute; char attributeName[32]; //using stack for performance and heap de-fragmentation //normal tagname resolution attributeEnd = attributeStart; while ((c = *attributeEnd) && ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9'))) attributeEnd++; //nearest space or > or / (
) or EOF if (c) { //EOF check if (attributeLength = attributeEnd - attributeStart) { //length may be 0 in the case of <[> /$] if (attributeLength < 32) { //maximum tagname length restriction strncpy_s(attributeName, attributeLength+1, attributeStart, attributeLength); attributeName[attributeLength] = 0; iAttribute = m_attributes.find(attributeName); //we have the tag name now: if (iAttribute != m_attributes.end()) thisAttribute = iAttribute->second; //found! } } } return thisAttribute; } CExplorerBar::entity CExplorerBar::getentityname(const char *entityStart) const { const char *entityEnd; char c; size_t entityLength; entity thisEntity = unknownEntity; StringMapCI::const_iterator iEntity; char entityName[32]; //using stack for performance and heap de-fragmentation while (*entityStart == '&' || *entityStart == '#') entityStart++; //after & or # or to EOF entityEnd = entityStart; while ((c = *entityEnd) && c != ';' && c != '\n' && c != '\r' && c != ' ') entityEnd++; //nearest space or ; or EOF if (c) { //EOF check if (entityLength = entityEnd - entityStart) { //length may be 0 in the case of &[;$ ] if (entityLength < 32) { //maximum tagname length restriction strncpy_s(entityName, entityLength+1, entityStart, entityLength); entityName[entityLength] = 0; iEntity = m_entities.find(entityName); //we have the name now: if (iEntity != m_entities.end()) thisEntity = iEntity->second; //found! } } } return thisEntity; } const char *CExplorerBar::parse(const char *body, size_t bodysize) const { //this parsing process requires many diverse custom rules //that take effect at different stages //thus there are several parsing techniques employed all over the place if (!bodysize) bodysize = strlen(body); //chunks of stuff vector chunks; chunks.reserve( bodysize / 100); vector markupareas; markupareas.reserve( bodysize / 100); vector selectedzones; selectedzones.reserve(bodysize / 50000 + 10); char *codestring; //represents the document in letter codes e.g. H1T4Il = heading 1, Text > 40 words, Linked Image //document level averages, counts and maximums Tag *maxHeadingTag; unsigned int numHeadings; unsigned int totalWordCount, totalChunksSize, totalAlphaNumerics, totalBytesSize, maxDepth; PERCENTAGE avgTextHTMLDensity; PERCENTAGE avgAlphaNumeric; unsigned int avgWordDensity; //processing path: //parse the HTML into text only chunks and also related in-line continuous markup/text areas //and calculate text-chunk level statistics //this is the only function that accesses the body, //the rest works of the text chunks and areas getTextChunks(body, bodysize, &chunks, &markupareas); //output: chunks, markupareas, chunkformats //calculate document averages over all text chunks and HTML in-line areas calcAggregates(&chunks, &markupareas, //(passed by reference by the function definition) totalWordCount, totalChunksSize, totalAlphaNumerics, totalBytesSize, maxDepth, &maxHeadingTag, numHeadings, avgTextHTMLDensity, avgAlphaNumeric, avgWordDensity ); //Generate the encoded string representation of the chunk types e.g. H1T4Il = heading 1, Text > 40 words, Linked Image generateCodeString(&chunks, &markupareas, &codestring); //output: codestring //select zones of markup areas selectZones(&chunks, &markupareas, &selectedzones); //output: selectedzones //write a new HTML document with all debug in //caller responsible for freeing newbody const char *newbody = writeNewBody(&chunks, &markupareas, &selectedzones, body, bodysize); free((void*)codestring); return newbody; } const size_t CExplorerBar::generateCodeString(vector *chunks, vector *markupareas, char **rcodestring) const { //caller frees result char *codestring = (char*) mallocCheck(markupareas->size() * 4 + 1); *codestring = 0; //immediate zero terminate char *pos = codestring; //pos for progressive writing *rcodestring = codestring; //return pointer vector::const_iterator iMUA, iMUABegin = markupareas->begin(), iMUAEnd = markupareas->end(); size_t numberChunks; //number of chunks in the MUA (always 1 for headings, images, alts) markuparea mua; //current MUA textchunk tc1, //first textchunk in the MUA tc; //any general textchunk chunktype type1; //the type of the first textchunk (also = type for the MUA in the case of headings etc.) //basic MUA properties bool hasHeading, //0, tag_hx, tag_b, tag_font etc. isTextChunk, //chunk_text, chunk_image, chunk_alt isValidTextArea, //MUA we want hasLink; //if the current MUA is a link MUA for (iMUA = iMUABegin; iMUA != iMUAEnd; iMUA++) { mua = *iMUA; //iterator boundary checks #ifdef _DEBUG if (mua.firstchunk >= chunks->size()) {char debug[1024];sprintf(debug, "iterator mismatch: mua.firstchunk >= chunks->size() MUA:%u", iMUA - iMUABegin);DEBUGERROR0(debug);} if (mua.lastchunk >= chunks->size()) {char debug[1024];sprintf(debug, "iterator mismatch: mua.lastchunk >= chunks->size() MUA:%u", iMUA - iMUABegin);DEBUGERROR0(debug);} if (mua.firstchunk > mua.lastchunk ) {char debug[1024];sprintf(debug, "iterator mismatch: mua.firstchunk > mua.lastchunk MUA:%u", iMUA - iMUABegin);DEBUGERROR0(debug);} #endif numberChunks = mua.lastchunk - mua.firstchunk; //should never be 0 (0 chunk areas are not created) tc1 = chunks->operator [](mua.firstchunk); //the first and possibly only chunk type1 = tc1.type; //its type hasHeading = (tc1.headingTag != 0); //0, tag_hx, tag_b, tag_font etc. isTextChunk = (type1 == chunk_text); //chunk_text, chunk_image, chunk_alt hasLink = (tc1.hrefStart != 0); isValidTextArea = isTextChunk && !hasHeading && mua.maxA1Density > 50 && mua.wordCount > 10; //identify special chunks /* for (i = iChunksBegin + mua.firstchunk; i != iChunksBegin + mua.lastchunk; i++) { //... } */ //size modifier //if (mua) //base type switch (type1) { case chunk_text: { if (isValidTextArea) sprintf(pos, "A%u%c", 1, 'd'); else sprintf(pos, "T%u%c", 1, 'd'); break; } case chunk_image: case chunk_alt: case chunk_content: {break;} } while (*pos) pos++; } return pos-codestring; } const size_t CExplorerBar::getTextChunks(const char *body, size_t bodysize, vector *chunks, vector *markupareas) const { //using text - tag density to spot relevant content //http://ai-depot.com/articles/the-easy-way-to-extract-useful-text-from-arbitrary-html/#more-90 //although we want to calculate the dense areas of text, //we also want to include lots of formatting so that the doucment can be re-produced from these selections //this function outputs chunks of HTML //thus: h1-6, font+/-, bold, italic, in-text images, etc. need to be included in the output from this function //declare local to keep re-entrant const char *bodyend = body + bodysize; vector::const_iterator i; //general cunks iterator vector csspath; csspath.reserve(200); //linking in the CSS hierarchy for calculating block level elements //------------------------------------------------------------- 1st pass: //compile text chunks into an array of positions and properties (no stats or selection yet) //anything the requires traversing and parsing (the rest is set calcs) char c; //current character const char *pos = body, //place in body parse *chunkStart = 0, //chunk of text (0 = no current chunk) *areaStart = 0, //area of HTML and text flow (0 = no current chunk) *blockStart = 0; //HTML tag block like unsigned int wordCount = 1, //counts intial word boundaries so start at 1 areaWordCount = 0, //words in the area alphanumerics = 0, //num of alphanumeric chars sentences = 1, //sentence count (dot space combos + 1) areaSentences = 0; //sentences in area size_t areaFirstChunk, //the index of the first chunk in the area areaLastChunk; //the index of the last+1 chunk in the area const char *lastFinish = body; size_t textLength, bytesLength; PERCENTAGE a1density = 0, //the density for the text chunk maxAIDensity = 0; //the maximum density for the textarea Tag *currenttag, *droptag, *adjacentTagBefore, //only valid if there was a tag immediately before this one e.g.
*lasttag, //the last tag e.g. some text
*headingTag = 0; //if the current text has a heading level bool starttag, //, not selfenclosedtag, //
jumpTag, // processingInstruction; // attribute currentattribute; //tag attribute recognition const char *attributeValueStart; bool quoted; //quoted attributes allow spaces //start finish pairs pointing into the main document (not zero terminated) const char *hrefStart = 0, //if the current text is in a link *hrefFinish = 0, //end of link *cssidStart = 0, *cssidFinish = 0, *classnameStart = 0, *classnameFinish = 0, *styleStart = 0, *styleFinish = 0; while (pos < bodyend && (c = *pos)) { //EOF check, pos valid check //each section deals with multi-increments of pos accordingly //zero terminator also deals with stristr(...) requests //pos is set to bodyend if it is accidentally set to 0 currenttag = 0; if (c == '<') { //we have a tag: can occur multiple times before a text break currenttag = gettag(pos); starttag = (pos[1] != '/'); //will be true for comment starts ( if (!pos) pos = bodyend; jumpTag = true; } //) look for the end of the end tag (>) or EOF: the characters in the tag are not counted in chunks cssidStart = 0; cssidFinish = 0; classnameStart = 0; classnameFinish = 0; styleStart = 0; styleFinish = 0; while ((c = *pos) && c != '>') { //skip to next tag-end or EOF pos++; if (c <= ' ' && (c = *pos) && ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) { //attribute start " checked ", " id=...", !" = " is not (=) //or erroneous value start: but we will simply get an unknownAttribute in this case currentattribute = getattributename(pos); } else if (c == '=') { //we have a value for an registered, identifieable attribute //if we have a currentattribute then it is a registered one and we are thus interested in the value //move to standard place at beginning of name after '"=\s to EOF quoted = false; while ((c = *pos) == '=' || c <= ' ' || c == '"' || c == '\'') { if (c == '"' || c == '\'') quoted = true; pos++; } attributeValueStart = pos; //nearest space or > or / (
) or EOF. Allow spaces and > if there were quotes at the beginning while ((c = *pos) && !( //criteria for the end of the attribute: ( quoted && (c == '"' || c == '\'')) //if quoted then a " ends the attribute || (!quoted && (c == '>' || c <= ' ')) //if !quoted then white space or > ends the attribute ) ) pos++; textLength = pos - attributeValueStart; //check for id, class and alt attributes switch (currentattribute) { case attribute_src: { if (tag_img == currenttag) { const textchunk tc = {chunk_image, attributeValueStart, pos, textLength, textLength, hrefStart, hrefFinish, 0, textLength, 0, headingTag, 0, //textDensity 0, //wordDensity 0, //a1Density markupareas->size(), csspath.size(), csspath }; chunks->push_back(tc); //(runs mem copy constructor for line) } break; } case attribute_id: { cssidStart = attributeValueStart; cssidFinish = pos; break; } case attribute_class: { classnameStart = attributeValueStart; classnameFinish = pos; break; } case attribute_style: { styleStart = attributeValueStart; styleFinish = pos; break; } case attribute_alt: { const textchunk tc = {chunk_alt, attributeValueStart, pos, textLength, textLength, hrefStart, hrefFinish, 0, textLength, 0, headingTag, 0, //textDensity 0, //wordDensity 0, //a1Density markupareas->size(), csspath.size(), csspath }; chunks->push_back(tc); //(runs mem copy constructor for line) break; } case attribute_content: { const textchunk tc = {chunk_content, attributeValueStart, pos, textLength, textLength, hrefStart, hrefFinish, 0, textLength, 0, headingTag, 0, //textDensity 0, //wordDensity 0, //a1Density markupareas->size(), csspath.size(), csspath }; chunks->push_back(tc); //(runs mem copy constructor for line) break; } case attribute_onclick: { break; } case attribute_href: { hrefStart = attributeValueStart; hrefFinish = pos; break; } } currentattribute = unknownAttribute; //processed } } selfenclosedtag = (*(pos-1) == '/') || processingInstruction; //tags that are self-enclosed
pos++; //go one past tag-end (if not EOF) //) linking in the CSS hierarchy for calculating block level elements //need to link in the id and class attributes if (!jumpTag && !selfenclosedtag && currenttag && !processingInstruction) { if (starttag) { //going deeper cssinstance cssi = {currenttag, cssidStart, cssidFinish, classnameStart, classnameFinish, styleStart, styleFinish}; csspath.push_back(cssi); } else { //returning up a level (not necessarily valid XHTML img, br etc. so check levels) //

: need to clear back down to the div level, ignoring the (not) img close //
: this will fail, falling back to the last span if (csspath.size()) { //... > div > img > br do { droptag = csspath.back().t; //br (copy) csspath.pop_back(); //... > div > img } while (csspath.size() && droptag != currenttag); //br != div } } lasttag = currenttag; adjacentTagBefore = currenttag; } } else if (c == '&') { //ignore entities (confuses alphanumerics count on small text chunks) while ((c = *++pos) && ( (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '#' || c == ';' )) 000; //skip to entity-end or EOF } else if (c <= ' ' || c == '/' || c == '\\') { //ignore initial white-space, illegal chars (EOF check done already in while condition) //word counting c = *++pos; if (chunkStart && c > ' ') wordCount++; //space word-start count + 1 } else if (c == '.' || c == '?' || c == '!') { //ignore initial dot space combos //sentence counting c = *++pos; if (chunkStart && c <= ' ') sentences++; //dot space combos + 1 } else { //alphanumerics counting (and chunk starting) if (!chunkStart) chunkStart = pos; //start new sentence if not one in progress if (!areaStart) { //start new area if not one in progress areaStart = pos; areaFirstChunk = chunks->size(); } if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) alphanumerics++; //do not skip to the next < tag start here because we need to count words and alphanumeric pos++; } if (!currenttag) adjacentTagBefore = 0; if (!pos) pos = bodyend; } //if text flows up to the EOF just forget it //if the last tag is an in-flow tag before EOF then we need to complete the last markup area //important because the text chunks may reference it if (areaStart) { areaLastChunk = chunks->size(); if (areaLastChunk > areaFirstChunk) { const markuparea mua = {areaStart, pos, maxAIDensity, areaWordCount, areaSentences, currenttag, areaFirstChunk, areaLastChunk-1}; markupareas->push_back(mua); } } return chunks->size(); } const size_t CExplorerBar::calcAggregates(vector *chunks, vector *markupareas, unsigned int &totalWordCount, unsigned int &totalChunksSize, unsigned int &totalAlphaNumerics, unsigned int &totalBytesSize, unsigned int &maxDepth, Tag **maxHeadingTag, unsigned int &numHeadings, PERCENTAGE &avgTextHTMLDensity, PERCENTAGE &avgAlphaNumeric, unsigned int &avgWordDensity ) const { //------------------------------------------------------------- 2nd pass: calc averages and totals vector::const_iterator i; //general cunks iterator totalWordCount = 0; totalChunksSize = 0; totalBytesSize = 0; totalAlphaNumerics = 0; maxDepth = 0; *maxHeadingTag = &tag_h6; //higher value than tag_h1 numHeadings = 0; textchunk tc; for (i = chunks->begin(); i != chunks->end(); i++) { tc = *i; totalWordCount += tc.wordCount; totalChunksSize += tc.textLength; totalBytesSize += tc.bytes; totalAlphaNumerics += tc.alphanumerics; if (tc.headingTag != 0) numHeadings++; if (tag_h1 <= tc.headingTag && tag_h6 >= tc.headingTag && tc.headingTag->operator>(*maxHeadingTag)) *maxHeadingTag = tc.headingTag; if (tc.depth > maxDepth) maxDepth = tc.depth; } avgTextHTMLDensity = totalChunksSize * 100 / totalBytesSize; avgAlphaNumeric = totalAlphaNumerics * 100 / totalChunksSize; avgWordDensity = totalWordCount * 1000 / totalBytesSize; return chunks->size(); } const size_t CExplorerBar::selectZones(vector *chunks, vector *markupareas, vector *selectedzones ) const { //------------------------------------------------------------- 4th pass: grab interesting areas vector::const_iterator iMUA, iMUABegin = markupareas->begin(), iMUAEnd = markupareas->end(), iFirstBackHeading = iMUAEnd, //position of the closest first back heading iFirstValidMarkupArea = iMUAEnd; //position of the valid textarea start vector::const_iterator i, //general cunks iterator iChunksBegin = chunks->begin(), iChunksEnd = chunks->end(); size_t numberChunks; //number of chunks in the MUA (always 1 for headings, images, alts) markuparea mua; //current MUA textchunk tc1, //first textchunk in the MUA tc; //any general textchunk chunktype type1; //the type of the first textchunk (also = type for the MUA in the case of headings etc.) //basic MUA properties bool hasHeading, //0, tag_hx, tag_b, tag_font etc. isTextChunk, //chunk_text, chunk_image, chunk_alt isValidTextArea, //MUA we want hasLink, //if the current MUA is a link MUA //inter-relationship booleans haveBackHeading = false, //if we have found a heading since the last isValidTextArea haveFirstValidMUA = false, //if we have found an isValidTextArea hasLinkedHeadings = false, //if any of the headings in the last heading group hasLink haveBackHeadingEnd = false; //if we have found the end of the consecutive headings group //situation of areas in relation to siblings //search forwards through the areas //calcs - output all areas with > 40% a1density //combine them with previous text areas > 40% a1density //also all images and alt in, just after or just before selected areas //also closest previous heading group (if close) //link h1 links and ".*More.*" links to the target document in DB //max heading state to make them relative (maxHeadingTag) //look for special areas: // link | link | link | link | link etc. // ul - li lists with no links in // proper data tables for (iMUA = iMUABegin; iMUA != iMUAEnd; iMUA++) { mua = *iMUA; //iterator boundary checks #ifdef _DEBUG if (mua.firstchunk >= chunks->size()) {char debug[1024];sprintf(debug, "iterator mismatch: mua.firstchunk >= chunks->size() MUA:%u", iMUA - iMUABegin);DEBUGERROR0(debug);} if (mua.lastchunk >= chunks->size()) {char debug[1024];sprintf(debug, "iterator mismatch: mua.lastchunk >= chunks->size() MUA:%u", iMUA - iMUABegin);DEBUGERROR0(debug);} if (mua.firstchunk > mua.lastchunk ) {char debug[1024];sprintf(debug, "iterator mismatch: mua.firstchunk > mua.lastchunk MUA:%u", iMUA - iMUABegin);DEBUGERROR0(debug);} #endif numberChunks = mua.lastchunk - mua.firstchunk; //should never be 0 (0 chunk areas are not created) tc1 = chunks->operator [](mua.firstchunk); //the first and possibly only chunk type1 = tc1.type; //its type hasHeading = (tc1.headingTag != 0); //0, tag_hx, tag_b, tag_font etc. isTextChunk = (type1 == chunk_text); //chunk_text, chunk_image, chunk_alt hasLink = (tc1.hrefStart != 0); isValidTextArea = isTextChunk && !hasHeading && mua.maxA1Density > 50 && mua.wordCount > 10; //identify special chunks /* for (i = iChunksBegin + mua.firstchunk; i != iChunksBegin + mua.lastchunk; i++) { //... } */ //finish the last one first because the new MUA may be a heading group start //see if we have a start and a finish //remember that isValidTextArea cannot be a heading if (haveFirstValidMUA && !isValidTextArea) { //add the area to the selected list selectedzone sz = { (haveBackHeading ? iFirstBackHeading - iMUABegin : iFirstValidMarkupArea - iMUABegin), (iMUA - iMUABegin) - 1, //we are one passed the last valid MUA mua.wordCount, hasLinkedHeadings }; selectedzones->push_back(sz); //reset haveBackHeading = false; //because it related to this zone, not the next haveBackHeadingEnd = false; haveFirstValidMUA = false; hasLinkedHeadings = false; } //see if we have a *new* heading group beginning (possibly overriding an already registered one) //note that haveBackHeadingEnd only occurs after a non-heading MUA is found (i.e. a new group) //must hasHeading for this so that heading groups are not wiped during isValidTextArea if (hasHeading && (!haveBackHeading || haveBackHeadingEnd)) { iFirstBackHeading = iMUA; haveBackHeading = true; haveBackHeadingEnd = false; hasLinkedHeadings = false; } //check for the end of the back heading group (so we can start a new one for non-consecutive headings) //haveBackHeadingEnd will always be true for isValidTextArea //what is important is the haveBackHeading and its position if (haveBackHeading && !hasHeading) { haveBackHeadingEnd = true; } //check for linked headings //any heading will be in the heading group //reset when a new heading group is started if (hasHeading && hasLink) { hasLinkedHeadings = true; } //see if we have a *new* valid text MUA //remember that isValidTextArea cannot be a heading if (!haveFirstValidMUA && isValidTextArea) { iFirstValidMarkupArea = iMUA; haveFirstValidMUA = true; } } return selectedzones->size(); } const char *CExplorerBar::writeNewBody(vector *chunks, vector *markupareas, vector *selectedzones, const char *body, const size_t bodysize) const { //caller frees result if not 0 vector::const_iterator i; //general cunks iterator vector::const_iterator iChunksBegin = chunks->begin(); vector::const_iterator iChunksEnd = chunks->end(); vector::const_iterator iCSS; //output body to replace the current body (with markup) const char *newbody = (const char*)mallocCheck(5 * 1024 * 1024); char *newpos = (char*) newbody; //also output the graphs at this stage (expandeable) textchunk tc; //current chunk markuparea mua; //current area size_t lastMarkupAreaID = 0; //so we know when it changes, to change colour bool bColor = false; //DHTML container strcpy(newpos, "+ page content analysis\
"); while (*newpos) newpos++; //TABLE strcpy(newpos, "\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "); while (*newpos) newpos++; for (i = iChunksBegin; i != iChunksEnd; i++) { //situation of chunk in relation to previous, next and area tc = *i; mua = markupareas->operator [](tc.markupAreaID); //TR and area change if (lastMarkupAreaID != tc.markupAreaID) bColor = !bColor; sprintf(newpos, "", (bColor ? "white" : "#d0d0d0")); while (*newpos) newpos++; //checkbox, text and href strcpy(newpos, ""); while (*newpos) newpos++; //csspath strcpy(newpos, ""); while (*newpos) newpos++; //markup area strcpy(newpos, ""); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; //depth strcpy(newpos, ""); while (*newpos) newpos++; //textLength strcpy(newpos, ""); while (*newpos) newpos++; //alphanumerics strcpy(newpos, ""); while (*newpos) newpos++; //wordCount strcpy(newpos, ""); while (*newpos) newpos++; //sentences strcpy(newpos, ""); while (*newpos) newpos++; //bytes //strcpy(newpos, ""); while (*newpos) newpos++; //text/byte density strcpy(newpos, ""); while (*newpos) newpos++; //a1/byte density strcpy(newpos, ""); while (*newpos) newpos++; //word/byte density strcpy(newpos, ""); while (*newpos) newpos++; //max area a1/byte density strcpy(newpos, ""); while (*newpos) newpos++; //include? strcpy(newpos, ""); while (*newpos) newpos++; //graph - plot all the values! strcpy(newpos, ""); while (*newpos) newpos++; lastMarkupAreaID = tc.markupAreaID; } strcpy(newpos, "
chunkcss
path
mrk
area
wrd
cnt
sentdepthtext
len
a1wrd
cnt
sentbyte
dens
a1
dens
word
dens
area
a1
dens
inc?graph
 "); while (*newpos) newpos++; sprintf(newpos, "[%u]", i - iChunksBegin); while (*newpos) newpos++; if (tc.headingTag) {sprintf(newpos, "<%s style=display:inline>", tc.headingTag->text()); while (*newpos) newpos++;} if (tc.hrefStart) { strcpy(newpos, ""); while (*newpos) newpos++; } switch (tc.type) { case chunk_image: { strcpy( newpos, ""); while (*newpos) newpos++; break; } case chunk_alt: { strcpy( newpos, "(alt):"); while (*newpos) newpos++; strncpy(newpos, tc.start, tc.textLength); newpos += tc.textLength; //does not include zero terminator break; } default: { strncpy(newpos, tc.start, tc.textLength); newpos += tc.textLength; //does not include zero terminator break; } } if (tc.hrefStart) {sprintf(newpos, ""); while (*newpos) newpos++;} if (tc.headingTag) {sprintf(newpos, " (%s)", tc.headingTag->text(), tc.headingTag->text()); while (*newpos) newpos++;} strcpy(newpos, ""); while (*newpos) newpos++; if (!lastMarkupAreaID || lastMarkupAreaID != tc.markupAreaID) { sprintf(newpos, "[%u] ", tc.markupAreaID); while (*newpos) newpos++; sprintf(newpos, "<%s>", mua.ender->text()); while (*newpos) newpos++; sprintf(newpos, "
%u-%u", mua.firstchunk, mua.lastchunk); while (*newpos) newpos++; } strcpy(newpos, "
"); while (*newpos) newpos++; if (lastMarkupAreaID != tc.markupAreaID) {sprintf(newpos, "%u", mua.wordCount); while (*newpos) newpos++;} strcpy(newpos, ""); while (*newpos) newpos++; if (lastMarkupAreaID != tc.markupAreaID) {sprintf(newpos, "%u", mua.sentences); while (*newpos) newpos++;} strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.depth); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.textLength); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.alphanumerics); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.wordCount); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.sentences); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; //sprintf(newpos, "%u", bytesLength); while (*newpos) newpos++; //strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%hi%%", tc.textDensity); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%hi%%", tc.a1Density); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.wordDensity); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%hi%%", mua.maxA1Density); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "", (tc.a1Density >= 40 ? "checked" : "")); while (*newpos) newpos++; sprintf(newpos, "", (tc.wordCount >= 4 ? "checked" : "")); while (*newpos) newpos++; sprintf(newpos, "", (tc.wordCount >= 20 ? "checked" : "")); while (*newpos) newpos++; sprintf(newpos, "", (tc.sentences >= 2 ? "checked" : "")); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "
 
", tc.a1Density); while (*newpos) newpos++; sprintf(newpos, "
 
", 40); while (*newpos) newpos++; strcpy(newpos, "
"); while (*newpos) newpos++; strcpy(newpos, "
"); while (*newpos) newpos++; //------------------------------------------------------------- 5th pass: output the relevant text(s) strcpy(newpos, "+ page selected zones\
"); while (*newpos) newpos++; vector::const_iterator iZone; selectedzone sz; lastMarkupAreaID = 0; for (iZone = selectedzones->begin(); iZone != selectedzones->end(); iZone++) { sz = *iZone; //output chunks (need to output the last chunk too) i = iChunksBegin + markupareas->operator [](sz.firstMUA).firstchunk; do { tc = *i++; if (tc.headingTag) {sprintf(newpos, "<%s>", tc.headingTag->text()); while (*newpos) newpos++;} if (tc.hrefStart) { strcpy(newpos, ""); while (*newpos) newpos++; } switch (tc.type) { case chunk_image: { strcpy( newpos, ""); while (*newpos) newpos++; break; } default: { strncpy(newpos, tc.start, tc.textLength); newpos += tc.textLength; //does not include zero terminator break; } } if (tc.hrefStart) {sprintf(newpos, ""); while (*newpos) newpos++;} if (tc.headingTag) {sprintf(newpos, " (%s)", tc.headingTag->text(), tc.headingTag->text()); while (*newpos) newpos++;} lastMarkupAreaID = tc.markupAreaID; } while (i <= iChunksBegin + markupareas->operator [](sz.lastMUA).lastchunk); strcpy(newpos, "
"); while (*newpos) newpos++; //zone delimiter } strcpy(newpos, "
"); while (*newpos) newpos++; //------------------------------------------------------------- output origonal body //strcpy(newpos, body); //includes zero terminator //newpos += bodysize; *newpos = 0; return newbody; }