// // Word parsing algorithm // // Regex that strips out stuff that we don't want to parse boost::regex strip("(|<\\s*script.*?script\\s*>|&\\w+;|&#\\d+;)"); string data = boost::regex_replace(htmlData, strip, ""); // Try to extract the description from the title boost::regex title_exp("<\\s*title[^>]*>(.*?)<\\s*/\\s*title\\s*>"); boost::sregex_token_iterator a(data.begin(), data.end(), title_exp, 1); titleDescription = *a; // If we can't extract the title from the description, then look to the headers if (titleDescription == "") { boost::regex header_exp("<\\s*h(\\d)\\s*>(.*?)<\\s*/\\s*h\1\\s*>"); boost::sregex_token_iterator i(data.begin(), data.end(), header_exp, 2); boost::sregex_token_iterator j; while ((i != j) && (*i == "")) i++; headerDescription = *i; } // Extract the body from the html boost::regex body_filter("<\\s*body[^>]*>.*<\\s*/\\s*body\\s*>", boost::regex::normal | boost::regbase::icase); boost::sregex_token_iterator b(data.begin(), data.end(), body_filter, 0); string body = *b; // Find all of the characters between tags boost::regex tag_filter(">([^<>]*[^<>\\s]+[^<>]*)<", boost::regex::normal | boost::regbase::icase); boost::sregex_token_iterator k(body.begin(), body.end(), tag_filter, 1); boost::sregex_token_iterator l; string words = titleDescription; while (k != l) { words += " " + *k++; } // If we still don't have a description, store the first 100 characters // or just all of them, if there are fewer than 100 if (headerDescription == "" && titleDescription == "") { int count = 0; for (int i = 0; i < (count != 100 && words.size()); i++) { if (!isspace(words[i])) { bodyDescription += words[i]; count++; } } } // Set the Description page.setDescription(titleDescription + headerDescription + bodyDescription); // Parse out individual words boost::regex words_filter("(? for link parsing // //boost::sregex_token_iterator g(htmlData.begin(), htmlData.end(), html_filter, 1); //boost::regex html_filter("<\\s*html[^>]*>(.*?)<\\s*/\\s*html\\s*>", boost::regex::normal | boost::regbase::icase); //boost::sregex_token_iterator i(data.begin(), data.end(), html_filter, 1); // Filter out all links and add them to the queue boost::regex link_filter("<\\s*a\\s+[^>]*href\\s*=\\s*\'?\"?([^\'\" >]*)", boost::regex::normal|boost::regbase::icase); boost::sregex_token_iterator o(data.begin(), data.end(), link_filter, 1); boost::sregex_token_iterator p; while (o != p) { queue->add(URL(*o++,currentPath)); }