// Downloads a web page (Windows, urlmon) and prints the hyperlinks on it whose target starts with "/forum/".
#include <iostream>
#include <fstream>
#include <string>
#include <iterator>
#include <set>
#include <regex>
#include <urlmon.h> // standard windows header
#pragma comment( lib, "urlmon" ) // standard windows library
// Fetches the resource at `url` and stores it in the local file `path`
// by calling the WinAPI function URLDownloadToFileA (urlmon).
// Returns true only when that call reports S_OK, false for any other result.
bool download_file( std::string url, std::string path )
{
    const auto status = URLDownloadToFileA( nullptr, url.c_str(), path.c_str(), 0, nullptr ) ;
    return status == S_OK ;
}
// Reads the whole content of the file `file_name` and returns it as one string.
// The stream state is never checked, so a file that cannot be opened
// produces an empty string rather than an error.
std::string file_to_string( std::string file_name )
{
    std::ifstream input( file_name ) ;

    std::string contents ;
    // A default-constructed istreambuf_iterator is the end-of-stream marker.
    contents.assign( std::istreambuf_iterator<char>( input ), std::istreambuf_iterator<char>() ) ;
    return contents ;
}
// Returns the distinct href values of the <a ...> tags found in the file
// `html_file_name` (read via file_to_string). Matching is case-insensitive.
// Only double-quoted href values are recognised.
std::set<std::string> extract_hyperlinks( std::string html_file_name )
{
    // Capture group 1 is the quoted href value alone ([^"]*), so attributes that
    // follow the href (class, title, ...) no longer leak into the captured link.
    // The lazy optional group lets other attributes precede href, and requiring
    // whitespace before "href" keeps names such as data-href from matching.
    static const std::regex hl_regex( "<a\\s(?:[^>]*?\\s)??href\\s*=\\s*\"([^\"]*)\"", std::regex_constants::icase ) ;

    const std::string text = file_to_string( html_file_name ) ;

    // Token iterator over sub-match 1: yields just the href value of every match.
    std::sregex_token_iterator begin( text.begin(), text.end(), hl_regex, 1 ) ;
    std::sregex_token_iterator end ;
    return std::set<std::string>( begin, end ) ;
}
// Returns a std::set holding every element of [begin, end) for which
// `filter` yields true. The element type is the iterator's value_type;
// duplicates collapse because the result is a set.
template < typename ITERATOR, typename CALLABLE > auto apply_filter( ITERATOR begin, ITERATOR end, CALLABLE filter )
{
    using value_type = typename std::iterator_traits<ITERATOR>::value_type ;

    std::set<value_type> accepted ;
    while( begin != end )
    {
        if( filter( *begin ) ) accepted.insert( *begin ) ;
        ++begin ;
    }
    return accepted ;
}
// Downloads the page at `url` into the file `path`, extracts its hyperlinks,
// keeps those that start with "/forum/" and prints each one prefixed with `url`.
// Returns 1 (after reporting to std::cerr) when the download fails.
int main()
{
    const std::string url = "http://www.cplusplus.com" ; // adjust as required
    const std::string path = "cplusplus.com.html" ; // adjust as required
    const std::string prefix = "/forum/" ; // adjust as required

    // compare() inspects only the leading prefix.size() characters, so the
    // rest of the link is never scanned and the argument is not copied.
    const auto begins_with_forum = [&prefix] ( const std::string& str )
    { return str.compare( 0, prefix.size(), prefix ) == 0 ; } ;

    if( !download_file( url, path ) )
    {
        // Report the failure instead of exiting silently with a success status.
        std::cerr << "failed to download " << url << " to " << path << '\n' ;
        return 1 ;
    }

    const auto hlinks = extract_hyperlinks( path ) ;
    const auto filtered_hlinks = apply_filter( hlinks.begin(), hlinks.end(), begins_with_forum ) ;

    // The links are site-relative, so the base url is printed in front of each.
    for( const auto& link : filtered_hlinks ) std::cout << url << link << '\n' ;
}
|