|
Boost-Commit : |
From: daniel_james_at_[hidden]
Date: 2008-08-19 18:27:27
Author: danieljames
Date: 2008-08-19 18:27:26 EDT (Tue, 19 Aug 2008)
New Revision: 48239
URL: http://svn.boost.org/trac/boost/changeset/48239
Log:
Add link checking for css files.
Fixes #2173.
Text files modified:
trunk/tools/inspect/link_check.cpp | 122 ++++++++++++++++++++++++++++-----------
1 files changed, 86 insertions(+), 36 deletions(-)
Modified: trunk/tools/inspect/link_check.cpp
==============================================================================
--- trunk/tools/inspect/link_check.cpp (original)
+++ trunk/tools/inspect/link_check.cpp 2008-08-19 18:27:26 EDT (Tue, 19 Aug 2008)
@@ -15,10 +15,13 @@
namespace
{
- boost::regex url_regex(
+ boost::regex html_url_regex(
"<\\s*[^>]*\\s+(?:HREF|SRC)" // HREF or SRC
"\\s*=\\s*(['\"])(.*?)\\1",
boost::regbase::normal | boost::regbase::icase);
+ boost::regex css_url_regex(
+ "(\\@import\\s*[\"']|url\\s*\\(\\s*[\"']?)([^\"')]*)",
+ boost::regbase::normal | boost::regbase::icase);
// Regular expression for parsing URLS from:
// http://tools.ietf.org/html/rfc3986#appendix-B
@@ -26,15 +29,36 @@
"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?$",
boost::regbase::normal);
- // Decode percent encoded characters and html escapsed ampersands,
- // returns an empty string if there's an error.
- // The urls should really be fully HTML decoded at the beginning.
- std::string decode_url(std::string const& url_path) {
+ // Decode html escaped ampersands, returns an empty string if there's an error.
+ std::string decode_ampersands(std::string const& url_path) {
+ std::string::size_type pos = 0, next;
+ std::string result;
+ result.reserve(url_path.length());
+
+ while((next = url_path.find('&', pos)) != std::string::npos) {
+ result.append(url_path, pos, next - pos);
+ pos = next;
+ if(url_path.substr(pos, 5) == "&amp;") {
+ result += '&'; pos += 5;
+ }
+ else {
+ result += '&'; pos += 1;
+ }
+ break;
+ }
+
+ result.append(url_path, pos, url_path.length());
+
+ return result;
+ }
+
+ // Decode percent encoded characters, returns an empty string if there's an error.
+ std::string decode_percents(std::string const& url_path) {
std::string::size_type pos = 0, next;
std::string result;
result.reserve(url_path.length());
- while((next = url_path.find_first_of("&%", pos)) != std::string::npos) {
+ while((next = url_path.find('%', pos)) != std::string::npos) {
result.append(url_path, pos, next - pos);
pos = next;
switch(url_path[pos]) {
@@ -47,15 +71,6 @@
pos = next + 3;
break;
}
- case '&': {
- if(url_path.substr(pos, 5) == "&amp;") {
- result += '&'; pos += 5;
- }
- else {
- result += '&'; pos += 1;
- }
- break;
- }
}
}
@@ -64,6 +79,10 @@
return result;
}
+ bool is_css(const path & p) {
+ return p.extension() == ".css";
+ }
+
} // unnamed namespace
namespace boost
@@ -77,6 +96,9 @@
: m_broken_errors(0), m_unlinked_errors(0), m_invalid_errors(0),
m_bookmark_errors(0)
{
+ // HTML signatures are already registered by the base class,
+ // 'hypertext_inspector'
+ register_signature(".css");
}
// inspect (all) -----------------------------------------------------------//
@@ -90,7 +112,7 @@
m_paths[ relative_to( full_path, fs::initial_path() ) ] |= m_present;
}
-// inspect ( .htm, .html ) -------------------------------------------------//
+// inspect ( .htm, .html, .shtml, .css ) -----------------------------------//
void link_check::inspect(
const string & library_name,
@@ -108,6 +130,9 @@
boost::match_results< string::const_iterator > what;
boost::match_flag_type flags = boost::match_default;
+ boost::regex const& url_regex =
+ is_css(full_path) ? css_url_regex : html_url_regex;
+
while( boost::regex_search( start, end, what, url_regex, flags) )
{
// what[0] contains the whole string iterators.
@@ -132,12 +157,22 @@
error( library_name, source_path, string(name()) + " empty URL." );
return;
}
+
+ // Decode ampersand encoded characters.
+ string decoded_url = is_css(source_path) ? url : decode_ampersands(url);
+ if(decoded_url.empty()) {
+ if(!no_link_errors) {
+ ++m_invalid_errors;
+ error( library_name, source_path, string(name()) + " invalid URL (invalid ampersand encodings): " + url );
+ }
+ return;
+ }
boost::smatch m;
- if(!boost::regex_match(url, m, url_decompose_regex)) {
+ if(!boost::regex_match(decoded_url, m, url_decompose_regex)) {
if(!no_link_errors) {
++m_invalid_errors;
- error( library_name, source_path, string(name()) + " invalid URL: " + url );
+ error( library_name, source_path, string(name()) + " invalid URL: " + decoded_url );
}
return;
}
@@ -162,7 +197,7 @@
if(!authority_matched) {
if(!no_link_errors) {
++m_invalid_errors;
- error( library_name, source_path, string(name()) + " no hostname: " + url );
+ error( library_name, source_path, string(name()) + " no hostname: " + decoded_url );
}
}
@@ -171,13 +206,19 @@
else if(scheme == "file") {
if(!no_link_errors) {
++m_invalid_errors;
- error( library_name, source_path, string(name()) + " invalid URL (hardwired file): " + url );
+ error( library_name, source_path, string(name()) + " invalid URL (hardwired file): " + decoded_url );
+ }
+ }
+ else if(scheme == "mailto" || scheme == "ftp" || scheme == "news" || scheme == "javascript") {
+ if ( !no_link_errors && is_css(source_path) ) {
+ ++m_invalid_errors;
+ error( library_name, source_path, string(name()) + " invalid protocol for css: " + decoded_url );
}
}
- else if(!(scheme == "mailto" || scheme == "ftp" || scheme == "news" || scheme == "javascript")) {
+ else {
if(!no_link_errors) {
++m_invalid_errors;
- error( library_name, source_path, string(name()) + " unknown protocol: " + url );
+ error( library_name, source_path, string(name()) + " unknown protocol: " + decoded_url );
}
}
@@ -188,16 +229,24 @@
if(authority_matched) {
if(!no_link_errors) {
++m_invalid_errors;
- error( library_name, source_path, string(name()) + " invalid URL (hostname without protocol): " + url );
+ error( library_name, source_path, string(name()) + " invalid URL (hostname without protocol): " + decoded_url );
}
}
// Check the fragment identifier
- if(fragment_matched) {
- if ( !no_link_errors && fragment.find( '#' ) != string::npos )
- {
- ++m_bookmark_errors;
- error( library_name, source_path, string(name()) + " invalid bookmark: " + url );
+ if ( fragment_matched ) {
+ if ( is_css(source_path) ) {
+ if ( !no_link_errors ) {
+ ++m_invalid_errors;
+ error( library_name, source_path, string(name()) + " fragment link in CSS: " + decoded_url );
+ }
+ }
+ else {
+ if ( !no_link_errors && fragment.find( '#' ) != string::npos )
+ {
+ ++m_bookmark_errors;
+ error( library_name, source_path, string(name()) + " invalid bookmark: " + decoded_url );
+ }
}
// No more to do if it's just a fragment identifier
@@ -205,26 +254,26 @@
}
// Detect characters banned by RFC2396:
- if ( !no_link_errors && url.find_first_of( " <>\"{}|\\^[]'" ) != string::npos )
+ if ( !no_link_errors && decoded_url.find_first_of( " <>\"{}|\\^[]'" ) != string::npos )
{
++m_invalid_errors;
- error( library_name, source_path, string(name()) + " invalid character in URL: " + url );
+ error( library_name, source_path, string(name()) + " invalid character in URL: " + decoded_url );
}
// Check that we actually have a path.
if(url_path.empty()) {
if(!no_link_errors) {
++m_invalid_errors;
- error( library_name, source_path, string(name()) + " invalid URL (empty path in relative url): " + url );
+ error( library_name, source_path, string(name()) + " invalid URL (empty path in relative url): " + decoded_url );
}
}
// Decode percent and ampersand encoded characters.
- string decoded_path = decode_url(url_path);
+ string decoded_path = decode_percents(url_path);
if(decoded_path.empty()) {
if(!no_link_errors) {
++m_invalid_errors;
- error( library_name, source_path, string(name()) + " invalid URL (invalid character encodings): " + url );
+ error( library_name, source_path, string(name()) + " invalid URL (invalid character encodings): " + decoded_url );
}
return;
}
@@ -240,7 +289,7 @@
{
if(!no_link_errors) {
++m_invalid_errors;
- error( library_name, source_path, string(name()) + " invalid URL (error resolving path): " + url );
+ error( library_name, source_path, string(name()) + " invalid URL (error resolving path): " + decoded_url );
}
return;
}
@@ -262,7 +311,7 @@
if ( !no_link_errors && (itr->second & m_present) == 0 )
{
++m_broken_errors;
- error( library_name, source_path, string(name()) + " broken link: " + url );
+ error( library_name, source_path, string(name()) + " broken link: " + decoded_url );
}
}
@@ -277,7 +326,8 @@
if ( (itr->second & m_linked_to) != m_linked_to
&& (itr->second & m_nounlinked_errors) != m_nounlinked_errors
&& (itr->first.rfind( ".html" ) == itr->first.size()-5
- || itr->first.rfind( ".htm" ) == itr->first.size()-4)
+ || itr->first.rfind( ".htm" ) == itr->first.size()-4
+ || itr->first.rfind( ".css" ) == itr->first.size()-4)
// because they may be redirectors, it is OK if these are unlinked:
&& itr->first.rfind( "index.html" ) == string::npos
&& itr->first.rfind( "index.htm" ) == string::npos )
Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk