Boost logo

Boost-Commit :

From: daniel_james_at_[hidden]
Date: 2008-08-19 18:27:27


Author: danieljames
Date: 2008-08-19 18:27:26 EDT (Tue, 19 Aug 2008)
New Revision: 48239
URL: http://svn.boost.org/trac/boost/changeset/48239

Log:
Add link checking for css files.

Fixes #2173.

Text files modified:
   trunk/tools/inspect/link_check.cpp | 122 ++++++++++++++++++++++++++++-----------
   1 files changed, 86 insertions(+), 36 deletions(-)

Modified: trunk/tools/inspect/link_check.cpp
==============================================================================
--- trunk/tools/inspect/link_check.cpp (original)
+++ trunk/tools/inspect/link_check.cpp 2008-08-19 18:27:26 EDT (Tue, 19 Aug 2008)
@@ -15,10 +15,13 @@
 
 namespace
 {
- boost::regex url_regex(
+ boost::regex html_url_regex(
     "<\\s*[^>]*\\s+(?:HREF|SRC)" // HREF or SRC
     "\\s*=\\s*(['\"])(.*?)\\1",
     boost::regbase::normal | boost::regbase::icase);
+ boost::regex css_url_regex( // finds @import "..." / @import '...' and url(...) references
+ "(\\@import\\s*[\"']|url\\s*\\(\\s*[\"']?)([^\"')]*)", // group 1: introducer; group 2: text up to the next quote or ')'
+ boost::regbase::normal | boost::regbase::icase); // icase: matching ignores letter case
 
   // Regular expression for parsing URLS from:
   // http://tools.ietf.org/html/rfc3986#appendix-B
@@ -26,15 +29,36 @@
     "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?$",
     boost::regbase::normal);
 
- // Decode percent encoded characters and html escapsed ampersands,
- // returns an empty string if there's an error.
- // The urls should really be fully HTML decoded at the beginning.
- std::string decode_url(std::string const& url_path) {
+ // Decode HTML-escaped ampersands: every "&amp;" in url_path becomes "&".
+ // A bare '&' is copied through unchanged; there is no error path, so the
+ // result is empty only when url_path itself is empty.
+ std::string decode_ampersands(std::string const& url_path) {
+ std::string::size_type pos = 0, next;
+ std::string result;
+ result.reserve(url_path.length());
+
+ // Visit every '&' in the input, not just the first one, so that all
+ // "&amp;" sequences are decoded.
+ while((next = url_path.find('&', pos)) != std::string::npos) {
+ result.append(url_path, pos, next - pos);
+ result += '&';
+ // Skip the whole entity when it is "&amp;", otherwise just the '&'.
+ pos = next + (url_path.compare(next, 5, "&amp;") == 0 ? 5 : 1);
+ }
+
+ // Copy whatever follows the last ampersand.
+ result.append(url_path, pos, std::string::npos);
+
+ return result;
+ }
+
+ // Decode percent encoded characters, returns an empty string if there's an error.
+ std::string decode_percents(std::string const& url_path) {
     std::string::size_type pos = 0, next;
     std::string result;
     result.reserve(url_path.length());
 
- while((next = url_path.find_first_of("&%", pos)) != std::string::npos) {
+ while((next = url_path.find('%', pos)) != std::string::npos) {
       result.append(url_path, pos, next - pos);
       pos = next;
       switch(url_path[pos]) {
@@ -47,15 +71,6 @@
           pos = next + 3;
           break;
         }
- case '&': {
- if(url_path.substr(pos, 5) == "&amp;") {
- result += '&'; pos += 5;
- }
- else {
- result += '&'; pos += 1;
- }
- break;
- }
       }
     }
 
@@ -64,6 +79,10 @@
     return result;
   }
 
+ bool is_css(const path & p) { // true when p's extension compares equal to ".css"
+ return p.extension() == ".css"; // NOTE(review): presumably case-sensitive (".CSS" would not match) - confirm extension()'s return type
+ }
+
 } // unnamed namespace
 
 namespace boost
@@ -77,6 +96,9 @@
      : m_broken_errors(0), m_unlinked_errors(0), m_invalid_errors(0),
        m_bookmark_errors(0)
    {
+ // HTML signatures are already registered by the base class,
+ // 'hypertext_inspector'
+ register_signature(".css");
    }
 
 // inspect (all) -----------------------------------------------------------//
@@ -90,7 +112,7 @@
         m_paths[ relative_to( full_path, fs::initial_path() ) ] |= m_present;
     }
 
-// inspect ( .htm, .html ) -------------------------------------------------//
+// inspect ( .htm, .html, .shtml, .css ) -----------------------------------//
 
    void link_check::inspect(
       const string & library_name,
@@ -108,6 +130,9 @@
       boost::match_results< string::const_iterator > what;
       boost::match_flag_type flags = boost::match_default;
 
+ boost::regex const& url_regex =
+ is_css(full_path) ? css_url_regex : html_url_regex;
+
       while( boost::regex_search( start, end, what, url_regex, flags) )
       {
         // what[0] contains the whole string iterators.
@@ -132,12 +157,22 @@
         error( library_name, source_path, string(name()) + " empty URL." );
         return;
       }
+
+ // Decode ampersand encoded characters.
+ string decoded_url = is_css(source_path) ? url : decode_ampersands(url);
+ if(decoded_url.empty()) {
+ if(!no_link_errors) {
+ ++m_invalid_errors;
+ error( library_name, source_path, string(name()) + " invalid URL (invalid ampersand encodings): " + url );
+ }
+ return;
+ }
     
       boost::smatch m;
- if(!boost::regex_match(url, m, url_decompose_regex)) {
+ if(!boost::regex_match(decoded_url, m, url_decompose_regex)) {
         if(!no_link_errors) {
           ++m_invalid_errors;
- error( library_name, source_path, string(name()) + " invalid URL: " + url );
+ error( library_name, source_path, string(name()) + " invalid URL: " + decoded_url );
         }
         return;
       }
@@ -162,7 +197,7 @@
           if(!authority_matched) {
             if(!no_link_errors) {
               ++m_invalid_errors;
- error( library_name, source_path, string(name()) + " no hostname: " + url );
+ error( library_name, source_path, string(name()) + " no hostname: " + decoded_url );
             }
           }
 
@@ -171,13 +206,19 @@
         else if(scheme == "file") {
           if(!no_link_errors) {
             ++m_invalid_errors;
- error( library_name, source_path, string(name()) + " invalid URL (hardwired file): " + url );
+ error( library_name, source_path, string(name()) + " invalid URL (hardwired file): " + decoded_url );
+ }
+ }
+ else if(scheme == "mailto" || scheme == "ftp" || scheme == "news" || scheme == "javascript") {
+ if ( !no_link_errors && is_css(source_path) ) {
+ ++m_invalid_errors;
+ error( library_name, source_path, string(name()) + " invalid protocol for css: " + decoded_url );
           }
         }
- else if(!(scheme == "mailto" || scheme == "ftp" || scheme == "news" || scheme == "javascript")) {
+ else {
           if(!no_link_errors) {
             ++m_invalid_errors;
- error( library_name, source_path, string(name()) + " unknown protocol: " + url );
+ error( library_name, source_path, string(name()) + " unknown protocol: " + decoded_url );
           }
         }
 
@@ -188,16 +229,24 @@
       if(authority_matched) {
         if(!no_link_errors) {
           ++m_invalid_errors;
- error( library_name, source_path, string(name()) + " invalid URL (hostname without protocol): " + url );
+ error( library_name, source_path, string(name()) + " invalid URL (hostname without protocol): " + decoded_url );
         }
       }
 
       // Check the fragment identifier
- if(fragment_matched) {
- if ( !no_link_errors && fragment.find( '#' ) != string::npos )
- {
- ++m_bookmark_errors;
- error( library_name, source_path, string(name()) + " invalid bookmark: " + url );
+ if ( fragment_matched ) {
+ if ( is_css(source_path) ) {
+ if ( !no_link_errors ) {
+ ++m_invalid_errors;
+ error( library_name, source_path, string(name()) + " fragment link in CSS: " + decoded_url );
+ }
+ }
+ else {
+ if ( !no_link_errors && fragment.find( '#' ) != string::npos )
+ {
+ ++m_bookmark_errors;
+ error( library_name, source_path, string(name()) + " invalid bookmark: " + decoded_url );
+ }
         }
 
         // No more to do if it's just a fragment identifier
@@ -205,26 +254,26 @@
       }
 
       // Detect characters banned by RFC2396:
- if ( !no_link_errors && url.find_first_of( " <>\"{}|\\^[]'" ) != string::npos )
+ if ( !no_link_errors && decoded_url.find_first_of( " <>\"{}|\\^[]'" ) != string::npos )
       {
         ++m_invalid_errors;
- error( library_name, source_path, string(name()) + " invalid character in URL: " + url );
+ error( library_name, source_path, string(name()) + " invalid character in URL: " + decoded_url );
       }
 
       // Check that we actually have a path.
       if(url_path.empty()) {
         if(!no_link_errors) {
           ++m_invalid_errors;
- error( library_name, source_path, string(name()) + " invalid URL (empty path in relative url): " + url );
+ error( library_name, source_path, string(name()) + " invalid URL (empty path in relative url): " + decoded_url );
         }
       }
 
       // Decode percent and ampersand encoded characters.
- string decoded_path = decode_url(url_path);
+ string decoded_path = decode_percents(url_path);
       if(decoded_path.empty()) {
         if(!no_link_errors) {
           ++m_invalid_errors;
- error( library_name, source_path, string(name()) + " invalid URL (invalid character encodings): " + url );
+ error( library_name, source_path, string(name()) + " invalid URL (invalid character encodings): " + decoded_url );
         }
         return;
       }
@@ -240,7 +289,7 @@
       {
         if(!no_link_errors) {
           ++m_invalid_errors;
- error( library_name, source_path, string(name()) + " invalid URL (error resolving path): " + url );
+ error( library_name, source_path, string(name()) + " invalid URL (error resolving path): " + decoded_url );
         }
         return;
       }
@@ -262,7 +311,7 @@
       if ( !no_link_errors && (itr->second & m_present) == 0 )
       {
         ++m_broken_errors;
- error( library_name, source_path, string(name()) + " broken link: " + url );
+ error( library_name, source_path, string(name()) + " broken link: " + decoded_url );
       }
     }
 
@@ -277,7 +326,8 @@
        if ( (itr->second & m_linked_to) != m_linked_to
          && (itr->second & m_nounlinked_errors) != m_nounlinked_errors
          && (itr->first.rfind( ".html" ) == itr->first.size()-5
- || itr->first.rfind( ".htm" ) == itr->first.size()-4)
+ || itr->first.rfind( ".htm" ) == itr->first.size()-4
+ || itr->first.rfind( ".css" ) == itr->first.size()-4)
          // because they may be redirectors, it is OK if these are unlinked:
          && itr->first.rfind( "index.html" ) == string::npos
          && itr->first.rfind( "index.htm" ) == string::npos )


Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk