// Downloads a web page to a local file, extracts the hyperlinks it contains,
// and prints those beginning with "/forum/" as absolute URLs (Windows only: uses urlmon).
#include <iostream>
#include <fstream>
#include <string>
#include <iterator>
#include <set>
#include <regex>
#include <urlmon.h> // standard windows header
#pragma comment( lib, "urlmon" ) // standard windows library
// URLDownloadToFileA - WinAPI function
bool download_file( std::string url, std::string path )
{ return URLDownloadToFileA( nullptr, url.c_str(), path.c_str(), 0, nullptr ) == S_OK ; }
/// Reads the entire contents of a file into a string.
/// @param file_name  name of the file to read (opened in the default text mode)
/// @return every character obtained from the file's stream buffer; an empty
///         string when nothing can be read from it
std::string file_to_string( std::string file_name )
{
    using char_iterator = std::istreambuf_iterator<char> ;

    std::ifstream input( file_name ) ;
    // iterate over the raw stream buffer from the current position to end-of-stream
    std::string contents( char_iterator{input}, char_iterator{} ) ;
    return contents ;
}
/// Collects the distinct href targets of the anchor tags in an HTML file.
///
/// The file is read with file_to_string() and scanned with a case-insensitive
/// regular expression; only the text between the double quotes of the href
/// attribute is captured. Anchors may carry other attributes before or after
/// href (e.g. <a class="x" href="/y" target="_blank">). Single-quoted or
/// unquoted href values are not recognised.
///
/// @param html_file_name  name of the HTML file to scan
/// @return the set of unique href values found (sorted, as std::set orders them)
std::set<std::string> extract_hyperlinks( std::string html_file_name )
{
    // <a, whitespace, optionally other attributes ending in whitespace, then
    // href = "value". The capture stops at the closing quote, so attributes
    // that follow href are never swallowed into the link.
    static const std::regex hl_regex( R"rx(<a\s+(?:[^>]*?\s)?href\s*=\s*"([^"]*)")rx",
                                      std::regex_constants::icase ) ;

    // must stay alive while the token iterators below refer into it
    const std::string text = file_to_string(html_file_name) ;

    using token_iterator = std::sregex_token_iterator ;
    // sub-match 1 is the href value; the set is built from the iterator range
    return { token_iterator( text.begin(), text.end(), hl_regex, 1 ), token_iterator{} } ;
}
/// Builds a std::set holding the elements of [begin, end) accepted by a predicate.
/// @param begin   iterator to the first element to examine
/// @param end     iterator one past the last element to examine
/// @param filter  callable invoked with each element; a result that converts
///                to true means the element is kept
/// @return a std::set of the iterator's value_type containing the kept elements
template < typename ITERATOR, typename CALLABLE >
auto apply_filter( ITERATOR begin, ITERATOR end, CALLABLE filter )
{
    using value_type = typename std::iterator_traits<ITERATOR>::value_type ;

    std::set<value_type> selected ;
    while( begin != end )
    {
        if( filter(*begin) ) selected.insert(*begin) ;
        ++begin ;
    }
    return selected ;
}
/// Program entry point: downloads a page to a local file, extracts its
/// hyperlinks, keeps those starting with "/forum/" and prints each one
/// prefixed with the site URL, one per line.
/// @return 0 on success, 1 when the page could not be downloaded
int main()
{
    const std::string url = "http://www.cplusplus.com" ; // adjust as required
    const std::string path = "cplusplus.com.html" ; // adjust as required

    // adjust as required
    // rfind with position 0 only tests the start of the string, so the rest
    // of a long link is never scanned
    const auto begins_with_forum = [] ( const std::string& str ) { return str.rfind( "/forum/", 0 ) == 0 ; };

    if( !download_file( url, path ) )
    {
        // report the failure instead of exiting silently with a success status
        std::cerr << "failed to download " << url << " to " << path << '\n' ;
        return 1 ;
    }

    const auto hlinks = extract_hyperlinks(path) ;
    const auto filtered_hlinks = apply_filter( std::begin(hlinks), std::end(hlinks), begins_with_forum ) ;

    // the links are site-relative; print them as absolute URLs
    for( const std::string& hlink : filtered_hlinks ) std::cout << url << hlink << '\n' ;
}
// end of listing