mirror of
https://github.com/ultimatepp/ultimatepp.git
synced 2026-05-31 22:04:04 -06:00
reference: GuiWebCrawler
git-svn-id: svn://ultimatepp.org/upp/trunk@4792 f0d560ea-af0d-0410-9eb7-867de7ffcac7
This commit is contained in:
parent
9ba808013f
commit
994ab898db
5 changed files with 178 additions and 2 deletions
155
reference/GuiWebCrawler/GuiWebCrawler.cpp
Normal file
155
reference/GuiWebCrawler/GuiWebCrawler.cpp
Normal file
|
|
@ -0,0 +1,155 @@
|
|||
#include <CtrlLib/CtrlLib.h>
|
||||
|
||||
using namespace Upp;
|
||||
|
||||
#define LAYOUTFILE <GuiWebCrawler/GuiWebCrawler.lay>
|
||||
#include <CtrlCore/lay.h>
|
||||
|
||||
// Main window of the crawler example. The widgets 'work', 'finished' and
// 'path' used by the methods come from the CrawlerLayout layout base.
struct WebCrawler : WithCrawlerLayout<TopWindow> {
	typedef WebCrawler CLASSNAME; // required by the THISBACK callback macros

	// One request being processed, together with the index of its URL in 'url'.
	struct Work {
		HttpRequest http;
		int         urli;
	};

	BiVector<int>          todo;  // indices into 'url' still waiting to be requested
	VectorMap<String, int> url;   // URL text -> index of the URL of the page it was found on
	Array<Work>            http;  // requests currently being processed
	int64                  total; // sum of the sizes of all finished responses

	WebCrawler();

	// Asks for the seed URL, opens the window and crawls until it is closed.
	void Run();

	// Queues every not yet known URL found in 'html'; 'srci' is the index of the source page.
	void ExtractUrls(const String& html, int srci);

	// Fills 'path' with the chain of URLs leading to the row selected in 'finished'.
	void ShowPath();

	// Copies the key of the current row of 'a' to the clipboard and opens it in the browser.
	void OpenURL(ArrayCtrl *a);
};
|
||||
|
||||
bool IsUrlChar(int c)
|
||||
{
|
||||
return c == ':' || c == '.' || IsAlNum(c) || c == '_' || c == '%' || c == '/';
|
||||
}
|
||||
|
||||
void WebCrawler::ExtractUrls(const String& html, int srci)
|
||||
{
|
||||
int q = 0;
|
||||
while(q < html.GetCount()) {
|
||||
q = html.Find("http://", q);
|
||||
if(q < 0)
|
||||
return;
|
||||
int b = q;
|
||||
while(q < html.GetCount() && IsUrlChar(html[q]))
|
||||
q++;
|
||||
String u = html.Mid(b, q - b);
|
||||
if(url.Find(u) < 0) {
|
||||
todo.AddTail(url.GetCount());
|
||||
url.Add(u, srci);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void WebCrawler::Run()
|
||||
{
|
||||
String seed = "www.ultimatepp.org";
|
||||
if(!EditText(seed, "GuiWebSpider", "Seed URL"))
|
||||
return;
|
||||
todo.AddTail(0);
|
||||
url.Add(seed);
|
||||
Open();
|
||||
while(IsOpen()) {
|
||||
ProcessEvents();
|
||||
while(todo.GetCount() && http.GetCount() < 60) {
|
||||
int i = todo.Head();
|
||||
todo.DropHead();
|
||||
Work& w = http.Add();
|
||||
w.urli = i;
|
||||
w.http.Url(url.GetKey(i))
|
||||
.UserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:11.0) Gecko/20100101 Firefox/11.0")
|
||||
.Timeout(0);
|
||||
work.Add(url.GetKey(i));
|
||||
work.HeaderTab(0).SetText(Format("URL (%d)", work.GetCount()));
|
||||
}
|
||||
SocketWaitEvent we;
|
||||
for(int i = 0; i < http.GetCount(); i++)
|
||||
we.Add(http[i].http);
|
||||
we.Wait(10);
|
||||
int i = 0;
|
||||
while(i < http.GetCount()) {
|
||||
Work& w = http[i];
|
||||
w.http.Do();
|
||||
String u = url.GetKey(w.urli);
|
||||
int q = work.Find(u);
|
||||
if(w.http.InProgress()) {
|
||||
if(q >= 0)
|
||||
work.Set(q, 1, w.http.GetPhaseName());
|
||||
i++;
|
||||
}
|
||||
else {
|
||||
String html = w.http;
|
||||
total += html.GetCount();
|
||||
finished.Add(u, w.http.IsError() ? String().Cat() << w.http.GetErrorDesc()
|
||||
: String().Cat() << w.http.GetStatusCode()
|
||||
<< ' ' << w.http.GetReasonPhrase()
|
||||
<< " (" << html.GetCount() << " bytes)",
|
||||
w.urli);
|
||||
finished.HeaderTab(0).SetText(Format("Finished (%d)", finished.GetCount()));
|
||||
finished.HeaderTab(1).SetText(Format("Response (%` KB)", total >> 10));
|
||||
if(w.http.IsSuccess()) {
|
||||
ExtractUrls(html, w.urli);
|
||||
Title(AsString(url.GetCount()) + " URLs found");
|
||||
}
|
||||
http.Remove(i);
|
||||
work.Remove(q);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void WebCrawler::ShowPath()
|
||||
{
|
||||
path.Clear();
|
||||
if(!finished.IsCursor())
|
||||
return;
|
||||
int i = finished.Get(2);
|
||||
Vector<String> p;
|
||||
for(;;) {
|
||||
p.Add(url.GetKey(i));
|
||||
if(i == 0)
|
||||
break;
|
||||
i = url[i];
|
||||
}
|
||||
for(int i = p.GetCount() - 1; i >= 0; i--)
|
||||
path.Add(p[i]);
|
||||
}
|
||||
|
||||
// Takes the key of the current row of 'a' as a URL, copies it to the
// clipboard and opens it in the web browser.
//
// Does nothing when the key is empty, so that the clipboard is not
// overwritten and no browser is launched without a URL.
// NOTE(review): an empty key is presumably what GetKey() yields when the
// list has no cursor - confirm against ArrayCtrl documentation.
void WebCrawler::OpenURL(ArrayCtrl *a)
{
	String u = a->GetKey();
	if(u.IsEmpty())
		return;
	WriteClipboardText(u);
	LaunchWebBrowser(u);
}
|
||||
|
||||
// Builds the window from its layout, defines the columns of the three lists
// and connects their callbacks.
WebCrawler::WebCrawler()
{
	total = 0;

	CtrlLayout(*this, "WebCrawler");
	Zoomable();
	Sizeable();

	// Requests being processed.
	work.AddColumn("URL");
	work.AddColumn("Status");

	// Ended requests; moving the cursor shows how the URL was reached.
	finished.AddColumn("Finished");
	finished.AddColumn("Response");
	finished.WhenCursor = THISBACK(ShowPath);
	finished.WhenLeftDouble = THISBACK1(OpenURL, &finished);

	// Chain of URLs for the row selected in 'finished'.
	path.AddColumn("Path");
	path.WhenLeftDouble = THISBACK1(OpenURL, &path);
}
|
||||
|
||||
// Application entry point: calls HttpRequest::Trace(), then creates the
// crawler window and runs it.
GUI_APP_MAIN
{
	HttpRequest::Trace();

	WebCrawler crawler;
	crawler.Run();
}
|
||||
6
reference/GuiWebCrawler/GuiWebCrawler.lay
Normal file
6
reference/GuiWebCrawler/GuiWebCrawler.lay
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
LAYOUT(CrawlerLayout, 680, 508)
|
||||
ITEM(ArrayCtrl, work, LeftPosZ(4, 356).TopPosZ(4, 500))
|
||||
ITEM(ArrayCtrl, finished, LeftPosZ(364, 312).TopPosZ(4, 324))
|
||||
ITEM(ArrayCtrl, path, LeftPosZ(364, 312).TopPosZ(332, 172))
|
||||
END_LAYOUT
|
||||
|
||||
11
reference/GuiWebCrawler/GuiWebCrawler.upp
Normal file
11
reference/GuiWebCrawler/GuiWebCrawler.upp
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
uses
|
||||
CtrlLib,
|
||||
Core/SSL;
|
||||
|
||||
file
|
||||
GuiWebCrawler.cpp,
|
||||
GuiWebCrawler.lay;
|
||||
|
||||
mainconfig
|
||||
"" = "GUI SSE2";
|
||||
|
||||
5
reference/GuiWebCrawler/init
Normal file
5
reference/GuiWebCrawler/init
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
#ifndef _GuiWebCrawler_icpp_init_stub
|
||||
#define _GuiWebCrawler_icpp_init_stub
|
||||
#include "CtrlLib/init"
|
||||
#include "Core/SSL/init"
|
||||
#endif
|
||||
|
|
@ -12,7 +12,7 @@ CONSOLE_APP_MAIN
|
|||
{
|
||||
MySqlSession session;
|
||||
// edit the connection parameters if necessary
|
||||
if(session.Connect("root", "koblih", "test")) {
|
||||
if(session.Connect("root", "Passw0rd", "test")) {
|
||||
Cout() << "Connected\n";
|
||||
SQL = session;
|
||||
|
||||
|
|
@ -43,5 +43,4 @@ CONSOLE_APP_MAIN
|
|||
Cerr() <<"ERROR: Unable to connect to database\n";
|
||||
SetExitCode(1);
|
||||
}
|
||||
SetExitCode(0);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue