I'm asking whether anyone out there has hit the same problem before, and if so, what the likely cause would be, for example "probably an access violation exception" ^^
@modoran - have you tried the spider example from chilkat?
Yes, that sample has a few errors in it, probably due to a recent API change. I tested it on Linux (it should also work on Windows without issues, but I don't have a Windows machine to test with right now), and it runs fine after fixing the errors:
#include <cstdio>
#include <CkSpider.h>
#include <CkStringArray.h>
#include <CkString.h>
void ChilkatSample(void);
int main() {
    ChilkatSample();
    return 0;
}
void ChilkatSample(void)
{
    // The Chilkat Spider component/library is free.
    CkSpider spider;
    CkStringArray seenDomains;
    CkStringArray seedUrls;

    seenDomains.put_Unique(true);
    seedUrls.put_Unique(true);

    // You will need to change the start URL to something else...
    seedUrls.Append("http://www.google.ro/");

    // Set outbound URL exclude patterns.
    // URLs matching any of these patterns will not be added to the
    // collection of outbound links.
    spider.AddAvoidOutboundLinkPattern("*?id=*");
    spider.AddAvoidOutboundLinkPattern("*.mypages.*");
    spider.AddAvoidOutboundLinkPattern("*.personal.*");
    spider.AddAvoidOutboundLinkPattern("*.comcast.*");
    spider.AddAvoidOutboundLinkPattern("*.aol.*");
    spider.AddAvoidOutboundLinkPattern("*~*");

    // Use a cache so we don't have to re-fetch URLs previously fetched.
    spider.put_CacheDir("/home/george/");
    spider.put_FetchFromCache(true);
    spider.put_UpdateCache(true);

    while (seedUrls.get_Count() > 0) {
        const char *url;
        url = seedUrls.pop();
        spider.Initialize(url);

        // Spider 5 URLs of this domain,
        // but first save the base domain in seenDomains.
        CkString domain;
        domain.setStringAnsi(url);
        //CkString s (url);
        spider.get_Domain(domain);
        seenDomains.Append(spider.getBaseDomain(domain));

        long i;
        bool success;
        for (i = 0; i <= 4; i++) {
            success = spider.CrawlNext();
            if (success != true) {
                break;
            }

            // Display the URL we just crawled.
            printf("%s\n", spider.lastUrl());

            // If the last URL was retrieved from cache,
            // we won't wait. Otherwise we'll wait 1 second
            // before fetching the next URL.
            if (spider.get_LastFromCache() != true) {
                spider.SleepMs(1000);
            }
        }

        // Add the outbound links to seedUrls, except
        // for the domains we've already seen.
        for (i = 0; i <= spider.get_NumOutboundLinks() - 1; i++) {
            url = spider.getOutboundLink(i);
            domain = spider.getUrlDomain(url);
            const char *baseDomain;
            baseDomain = spider.getBaseDomain(domain);
            if (!seenDomains.Contains(baseDomain)) {
                seedUrls.Append(url);
            }

            // Don't let our list of seedUrls grow too large.
            if (seedUrls.get_Count() > 1000) {
                break;
            }
        }
    }
}
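In case the build step is part of the problem, a Linux build command would look roughly like this; the source file name, include path, and library archive name are assumptions that depend on which Chilkat version you downloaded and where you unpacked it:

g++ spider_sample.cpp -I./chilkat/include ./chilkat/lib/libchilkat-9.5.0.a -lpthread -lresolv -o spider_sample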