-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathHttpManager.cpp
More file actions
152 lines (126 loc) · 4.59 KB
/
HttpManager.cpp
File metadata and controls
152 lines (126 loc) · 4.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#include "HttpManager.h"
#include "downloader/Downloader.h"
#include "WebResourceInfo.h"
#include "LocalResourceInfo.h"
#include "registers/FileRegistry.h"
#include "UrlLocalResourceKey.h"
#include "parsers/RegexpParser.h"
#include "ILocalResourceKey.h"
#include "UrlLocalResourceKey.h"
#include <fstream>
#include <algorithm>
#include <string>
#include <deque>
#include <time.h>
#include <iostream>
namespace ua { namespace kiev { namespace ukma { namespace downloader {
// Builds a manager that has no validators attached.
//   link     - start URL; run() puts it into the queue first, at depth 1.
//   maxDeep  - depth limit; run() extracts links only from pages whose depth is below it.
//   localDir - prefix of the local paths under which run() stores downloads;
//              init() appends a trailing '\' when it is non-empty and lacks one.
HttpManager::HttpManager(const string link, int maxDeep, const string localDir)
    : link(link),
      maxDeep(maxDeep),
      localDir(localDir)
{
    // The downloader, parser and registry are allocated by init().
    this->init();
}
// Builds a manager together with a list of validators.
//   link       - start URL; run() puts it into the queue first, at depth 1.
//   maxDeep    - depth limit; run() extracts links only from pages whose depth is below it.
//   localDir   - prefix of the local paths under which run() stores downloads;
//                init() appends a trailing '\' when it is non-empty and lacks one.
//   validators - copied into the manager; run() asks each one whether a resource
//                may be downloaded. The pointers are not deleted by this class.
HttpManager::HttpManager(const string link, int maxDeep, const string localDir, const vector<IValidator*>& validators)
    : link(link),
      maxDeep(maxDeep),
      localDir(localDir),
      validators(validators)
{
    // The downloader, parser and registry are allocated by init().
    this->init();
}
// Shared constructor tail: normalises localDir and allocates the helper objects
// (downloader, parser, registry) that the destructor later deletes.
void HttpManager::init()
{
    // A non-empty directory prefix must end with '\' so that paths can be
    // appended to it directly.
    const bool hasPrefix = !localDir.empty();
    if (hasPrefix && localDir[localDir.length() - 1] != '\\')
    {
        localDir += "\\";
    }

    downloader = new downloaders::Downloader();
    parser = new parsers::RegexpParser();

    // The key object is handed straight to the registry; according to the
    // original author's note the registry is the one that deletes it.
    registry = new registers::FileRegistry("downloader.reg", new UrlLocalResourceKey());
}
// Frees the helper objects that init() allocated.
HttpManager::~HttpManager()
{
    // Release order is unchanged: parser, then registry, then downloader.
    delete this->parser;
    delete this->registry;
    delete this->downloader;

    // The key generator given to FileRegistry in init() is not deleted here;
    // per the original note it is deleted within FileRegistry.
}
// Copy constructor.
//
// The previous empty body left the owning pointers (downloader, parser,
// registry) and maxDeep indeterminate in the new object. The destructor then
// ran `delete` on those indeterminate pointers for every copy, including the
// temporary created for the by-value parameter of operator=.
//
// The copy now receives the plain configuration of T (link, maxDeep, localDir,
// validators) and null helper pointers, which makes its destruction a no-op.
// No helper objects are allocated for the copy, so run() must not be called on it.
HttpManager::HttpManager(const HttpManager& T)
    : link(T.link),
      maxDeep(T.maxDeep),
      localDir(T.localDir),
      validators(T.validators)
{
    // `delete` on a null pointer does nothing, so ~HttpManager() is safe.
    downloader = 0;
    parser = 0;
    registry = 0;
}
// Assignment operator with an empty body: nothing is taken from T and the
// object on the left-hand side stays exactly as it was.
// T is received by value, so every call constructs (and then destroys) a copy
// of the right-hand operand through the copy constructor.
void HttpManager::operator=(HttpManager T)
{
}
void HttpManager::run()
{
unsigned int i;
deque<pair<string,int> > linksQueue; //queue that is used for BFS exploring
// We create another instance of UrlLocalResourceKey
UrlLocalResourceKey * keygen = new UrlLocalResourceKey();
linksQueue.push_back(make_pair(link,1));
while(!linksQueue.empty())
{
string url=linksQueue[0].first;
int currentDeep=linksQueue[0].second;
linksQueue.pop_front();
cout<<"Deep of search:"<<currentDeep<<" trying to get "<<url<<endl;
WebResourceInfo webResource=downloader->getWebResourceInfo(url);
//call all the validators and if not valid skip this resource
bool valid=true;
for(i=0;i<validators.size();i++)valid=valid && validators[i]->isValid(webResource);
if(!valid)
{
cout<<"But validators say not to download!"<<endl;
continue;
}
// oldbam
LocalResourceInfo storedLocalResource = registry->getRecord(url);
if (storedLocalResource.getUrl() == webResource.getUrl()) {
cout<<"INFO:"<<"Resource exists in the registry"<<endl;
cout<<"URL:"<<storedLocalResource.getUrl()<<endl;
cout<<"DATE:"<<storedLocalResource.getDate()<<endl;
cout<<"LOCALPATH:"<<storedLocalResource.getLocalPath()<<endl;
continue;
}
// end oldbam
string hostName=getHostName(webResource);
//virtual path of the resource on the server
string resourceHostPath=getResourceDirs(webResource);
//change last '/' to '\'
if(resourceHostPath!="")
resourceHostPath[resourceHostPath.length()-1]='\\';
//local resource's path
string localResourceDir=localDir+hostName+"\\"+resourceHostPath;
//full resource's local file name(path + name)
string fullLocalFileName=localResourceDir+getLocalFileName(webResource);
downloader->makeDir(localResourceDir);
bool result = downloader->download(webResource,localResourceDir);
if (result == true) cout<<"Ok. Placed to: "<<fullLocalFileName<<endl;
else cout<<"ERROR : Downloader returned 'false' after download."<<endl;
//create and register LocalResource
char timeChar [19];
_strdate( timeChar );
timeChar[8]='#';
_strtime( timeChar+9 );
LocalResourceInfo localResource(webResource.getUrl(),(string)timeChar,fullLocalFileName);
registry->addRecord(keygen->getKey(localResource),localResource);
//parse and add new links to the queue
if(webResource.getMimeValue("Content-Type").find("text/html")!=string::npos && currentDeep<maxDeep)
{
string content,tmp;
ifstream file(fullLocalFileName.c_str());
while (getline(file, tmp))
{
content += tmp;
}
content = encodeUrl(content, ' ');
vector<string> links=parser->getLinks(content);
//deleting duplicates
sort(links.begin(),links.end());
links.erase(unique(links.begin(),links.end()),links.end());
parser->relative2Absolute(links,getHostName(webResource)+"/"+getResourceDirs(webResource));
// modified by oldbam, add cout for links
cout<<"INFO : New links are found by manager:"<<endl;
for(i=0;i<links.size();i++)
{
linksQueue.push_back(make_pair(links[i],currentDeep+1));
cout<<"LINK : "<<links[i]<<endl<<"DEPTH : "<<currentDeep+1<<endl;
}
cout<<endl;
// end modified by oldbam
}
cout<<endl;
}
delete keygen;
}
}}}}