浏览 12491 次
锁定老帖子 主题:开放源码的网络蜘蛛
精华帖 (0) :: 良好帖 (0) :: 新手帖 (0) :: 隐藏帖 (0)
|
|
---|---|
作者 | 正文 |
发表时间:2007-11-07
开发环境:Visual Studio 2005 语言:C++/CLI 作品所要解决的主要问题: 对于给定的网站列表,下载其所有网页到数据库,可以限制抓取深度和吞吐量。 存储的信息包括标题、内容、网页大小、抓取时间、链接数、连接数等等。 作品的主要应用场景: 可用于数据采集、数据挖掘以及搜索引擎的前期工作。 基于IOCP模型构建,稳定性有保证。 而且有详细的开发文档。 使用说明: 1.直接点击bin目录下的WebSpiderEh.exe,即可开始抓取网站。 2.bin\db.mdb中的sites表配置您要抓取的网站,pages表保存抓取的结果。 3.maxDepth.txt中的数字控制抓取的深度。 4.throughput.txt中的数字控制蜘蛛的吞吐量,一般不用修改,如果您的网速很快,可以将数字调大一点。 谢谢使用,欢迎提出宝贵意见! 如果您下载后觉得好用,或者觉得源码对您有借鉴的价值,请投出您宝贵的一票。 声明:ITeye文章版权属于作者,受法律保护。没有作者书面许可不得转载。
推荐链接
|
|
返回顶楼 | |
发表时间:2007-11-07
// WebSpiderEh.cpp: 主项目文件。 #include "stdafx.h" #include "InitSock.h" #include "DataUnit.h" #include "HTMLDoc.h" #include "UrlQueue.h" #include "stdio.h" #include "algorithm" #include "map" #include "string" #include "iostream" #define BUFSMALL 1024 //小缓冲区的大小(一次接收) #define BUFBIG 600000 //大缓冲区的大小(整个网页) #define PRONUMBER 2 //并发进程数 #define TIME_OUT_TIME 1 //connect超时时间1秒 #define TIMEOUT_IDLE 500 //防死机*秒 #define TIMEOUT_IOCP INFINITE //IOCP查询周期 #define UPDATE_CYCLE 3 //热门关键词更新的周期(天数) //#define DBG using namespace System; using namespace System::Net; using namespace System::Text; using namespace System::IO; using namespace System::Text::RegularExpressions; using namespace System::Runtime::InteropServices; using namespace System::Collections::Generic; using namespace System::Data; using namespace System::Data::OleDb; using namespace System::Threading; using namespace System::Diagnostics; using namespace std; char * strtochar(String ^s){//将托管String^转化为Native Char* return ( char *)Marshal::StringToHGlobalAnsi(s).ToPointer(); } string Str2str(String^ s){//将托管String^转化为标准string char *ptr=( char *)Marshal::StringToHGlobalAnsi(s).ToPointer(); ptr[s->Length]='\0'; return string(ptr); } String^ strtomd5(String^ text){ return System::Web::Security::FormsAuthentication::HashPasswordForStoringInConfigFile(text,"md5"); } ref class Global{ public: //g_visited是全局变量,记录访问过的网页,避免重复访问 static Dictionary<String^,int> g_visited; //g_titleused是全局变量,记录出现过的标题,以达到标题去重的效果 static Dictionary<String^,int> g_titleused; //static IndexWriter^ g_writer; //g_priorities 保存每个网站的优先级 static Dictionary<String^,int> g_priorities; static OleDbConnection^ g_conn; static int postAmount=0; static const int throughput=int::Parse(File::OpenText("throughput.txt")->ReadLine()); static const int max_depth=int::Parse(File::OpenText("maxDepth.txt")->ReadLine()); }; // 初始化Winsock库 CInitSock theSock; HANDLE hCompletion;//完成端口 //控制IO查询线程是否继续工作 bool g_iogo; typedef struct _PER_HANDLE_DATA // per-handle数据 { SOCKET s; // 
对应的套节字句柄 sockaddr_in addr; // 服务器地址 } PER_HANDLE_DATA, *PPER_HANDLE_DATA; typedef struct _PER_IO_DATA // per-I/O数据 { OVERLAPPED ol; // 重叠结构 char buf[BUFBIG]; // 数据缓冲区 int nOperationType; // 操作类型 char url[BUFSMALL]; //网址 int top; //缓冲区指针 #define OP_READ 1 #define OP_WRITE 2 } PER_IO_DATA, *PPER_IO_DATA; String^ getDomain(String^ url){ Uri^ uri=gcnew Uri(url); String^ host=uri->Host; int i=host->IndexOf("."); return host->Substring(i+1); } void PostGet(String^ url){ try{ String^ urlflag=url->ToLower(); if(Global::g_visited.ContainsKey(urlflag))return ; if(url->Length>255)return ; if(url->EndsWith(".zip") || url->EndsWith(".gz") || url->EndsWith(".rar") || url->EndsWith(".exe") || url->EndsWith(".exe") || url->EndsWith(".jpg") || url->EndsWith(".png") || url->EndsWith(".tar") || url->EndsWith(".chm") || url->EndsWith(".iso")|| url->EndsWith(".gif") || url->EndsWith(".csv") || url->EndsWith(".pdf") || url->EndsWith(".doc")) return; SOCKET cs; Uri uri(url); String^ host=uri.Host; //域名 String^ destWeb=uri.PathAndQuery; //绝对URL路径 String^ IP=""; IP=Dns::GetHostEntry(host)->AddressList[0]->ToString(); //IP地址 int port=uri.Port; //端口 SOCKADDR_IN servAddr; servAddr.sin_family=AF_INET; servAddr.sin_addr.S_un.S_addr=inet_addr(strtochar(IP)); servAddr.sin_port=htons(port); cs=::socket(AF_INET,SOCK_STREAM,IPPROTO_TCP); if(INVALID_SOCKET==cs){ Console::WriteLine(L"创建套接字失败"); return ; } if(::connect(cs,(SOCKADDR*)&servAddr,sizeof(SOCKADDR))==SOCKET_ERROR){ Console::WriteLine(WSAGetLastError()); return ; } /*int error=-1, len; len = sizeof(int); timeval tm; fd_set set; unsigned long ul = 1; ioctlsocket(cs, FIONBIO, &ul); //设置为非阻塞模式 bool ret = false; if( connect(cs, (struct sockaddr *)&servAddr, sizeof(servAddr)) == -1) { tm.tv_sec = TIME_OUT_TIME; tm.tv_usec = 0; FD_ZERO(&set); FD_SET(cs, &set); if( select(cs+1, NULL, &set, NULL, &tm) > 0) { getsockopt(cs, SOL_SOCKET, SO_ERROR, (char *)&error, &len); if(error == 0) ret = true; else ret = false; } else ret = false; } else ret = true; 
ul = 0; ioctlsocket(cs, FIONBIO, &ul); //设置为阻塞模式 if(!ret){ closesocket(cs); return ; }*/ char getstr[BUFSMALL]={0}; sprintf_s(getstr,"GET %s HTTP/1.0\r\nHost: %s\r\nAccept: */*\r\nUser-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)\r\nReferer: %s\r\nPragma: no-cathe\r\nCathe-Control: no-cathe\r\nConnection: close\r\n\r\n",strtochar(destWeb),strtochar(host),strtochar(url)); //取得HTTP报头 int sendlen=strlen(getstr); ::send(cs,getstr,sendlen,0); //Console::WriteLine(url); // 建立新连接之后,为它创建一个per-handle数据,并将它们关联到完成端口对象。 PPER_HANDLE_DATA pPerHandle = (PPER_HANDLE_DATA)::GlobalAlloc(GPTR, sizeof(PER_HANDLE_DATA)); pPerHandle->s = cs; memcpy(&pPerHandle->addr, &servAddr, sizeof(servAddr)); ::CreateIoCompletionPort((HANDLE)pPerHandle->s, hCompletion, (DWORD)pPerHandle, 0); // 投递一个接收请求 PPER_IO_DATA pPerIO = (PPER_IO_DATA)::GlobalAlloc(GPTR, sizeof(PER_IO_DATA)); pPerIO->buf[0]='\0'; pPerIO->nOperationType=OP_READ; pPerIO->top=0; memcpy(pPerIO->url,strtochar(url),url->Length); WSABUF buf; buf.buf=pPerIO->buf; buf.len=BUFSMALL; DWORD nFlags=0; DWORD dwTrans=0; ::WSARecv(pPerHandle->s, &buf, 1, &dwTrans, &nFlags, &pPerIO->ol, NULL); Global::g_visited[urlflag]=1; //标志为访问过了 Global::postAmount++; //Console::WriteLine(Global::postAmount); //Console::WriteLine(UrlQueue::g_urlqueue->size()); //return 1; }catch(Exception^ ex){ #ifdef DBG Console::WriteLine(ex->Message); #endif delete ex; //System::GC::Collect(); //System::GC::WaitForPendingFinalizers(); //return 0; } } //addpage把分页好的网页数据暂存到access数据库中去 int addPage(String^ URL,String^ Title,String^ PlainText,String^ LinkText,int Size,String^ Host,int OutDegree,int Priority){ OleDbConnection^ conn=Global::g_conn; if(conn==nullptr || conn->State!=System::Data::ConnectionState::Open){ String^ connstr="Provider=Microsoft.Jet.OLEDB.4.0;Data Source="+System::AppDomain::CurrentDomain->BaseDirectory+"\\db.mdb"; //Console::WriteLine(connstr); conn=gcnew OleDbConnection(connstr); conn->Open(); Global::g_conn=conn; } 
try{ Regex^ reg=gcnew Regex("\\W{1}(?<tt>[0-9a-zA-Z_:,、;《》“”\u4e00-\u9fa5]{10,30})\\W"); String^ tmp=""; if(!Global::g_titleused.ContainsKey(Title)){ Global::g_titleused[Title]=1; } else{ Match^ mc=reg->Match(PlainText); if(mc->Success){ tmp=mc->Groups["tt"]->ToString(); while(Global::g_titleused.ContainsKey(tmp)){ mc=mc->NextMatch(); if(!mc->Success){ tmp=Title; break; } tmp=mc->Groups["tt"]->ToString(); } if(Title->Equals(tmp))Priority--; Title=tmp; Global::g_titleused[Title]=1; } } if(("http://"+Host)->ToLower()==URL->ToLower())Priority+=15; if(("http://"+Host+"/")->ToLower()==URL->ToLower())Priority+=15; if(("http://"+Host)->ToLower()==URL->ToLower() && URL->Split('.')->Length<=4)Priority+=15; if(("http://"+Host+"/")->ToLower()==URL->ToLower() && URL->Split('.')->Length<=4)Priority+=15; if(("http://www."+getDomain(URL))->ToLower()==URL->ToLower())Priority++; if(("http://www."+getDomain(URL)+"/")->ToLower()==URL->ToLower())Priority++; //String^ connstr="provider=microsoft.jet.oledb.4.0;data source=" + AppDomain::CurrentDomain::get()->BaseDirectory + "\\db.mdb"; OleDbDataAdapter^ da=gcnew OleDbDataAdapter("select 1 from pages where [url]='"+URL+"'",conn); DataSet^ ds=gcnew DataSet(); da->Fill(ds); if(ds->Tables[0]->Rows->Count>0){ //delete conn; return 1; } StringBuilder^ sb=gcnew StringBuilder(); sb->Append("insert into Pages([URL],[Title],[PlainText],[LinkText],[Size],[Host],[UpdateTime],[OutDegree],[Priority]) values('"); sb->Append(URL)->Append("','"); sb->Append(Title)->Append("','"); sb->Append(PlainText)->Append("','"); sb->Append(LinkText)->Append("',"); sb->Append(Size)->Append(",'"); sb->Append(Host)->Append("','"); sb->Append(DateTime::Now.ToString())->Append("',"); sb->Append(OutDegree.ToString())->Append(","); sb->Append(Priority.ToString())->Append(")"); OleDbCommand^ cmd=gcnew OleDbCommand(); cmd->Connection=conn; cmd->CommandText=sb->ToString(); cmd->ExecuteNonQuery(); //conn->Close(); //delete conn; delete sb; delete PlainText; delete URL; delete 
Title; delete LinkText; delete Host; /*HANDLE m_hMutex = OpenMutex( MUTEX_ALL_ACCESS, // request full access FALSE, // handle not inheritable L"mymutex"); // object name if(m_hMutex==NULL){ m_hMutex = CreateMutex(NULL, TRUE, L"mymutex"); } else{ WaitForSingleObject(m_hMutex,INFINITE); } String^ indexpath=AppDomain::CurrentDomain::get()->BaseDirectory + "\\index"; IndexWriter^ writer; try{ writer=gcnew IndexWriter(indexpath,gcnew Standard::StandardAnalyzer(),false); } catch(Exception^ ex){ delete ex; writer=gcnew IndexWriter(indexpath,gcnew Standard::StandardAnalyzer(),true); } IndexReader^ reader=IndexReader::Open(indexpath); reader->Delete(gcnew Term("url",URL)); reader->Close(); Document^ doc=gcnew Document(); doc->Add(gcnew Field("title",Title,Field::Store::YES,Field::Index::TOKENIZED,Field::TermVector::WITH_POSITIONS_OFFSETS)); doc->Add(gcnew Field("context", PlainText, Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)); doc->Add(Field::UnIndexed("host", Host)); doc->Add(Field::UnIndexed("page_size", Size.ToString())); doc->Add(gcnew Field("update_time", DateTime::Now.ToString(),Field::Store::YES,Field::Index::UN_TOKENIZED)); doc->Add(gcnew Field("priority", Priority.ToString(),Field::Store::YES,Field::Index::UN_TOKENIZED)); doc->Add(Field::Keyword("url", URL)); writer->AddDocument(doc); writer->Close(); ::ReleaseMutex(m_hMutex);*/ return 0; }catch(Exception^ ex){ #ifdef DBG Console::WriteLine(ex->Message); #endif delete ex; //delete conn; delete PlainText; delete URL; delete Title; delete LinkText; delete Host; //System::GC::Collect(); //System::GC::WaitForPendingFinalizers(); return 1; } //System::GC::Collect(); //System::GC::WaitForPendingFinalizers(); } int WebCut(LPVOID lpParam){ //分解并保存网页的函数 PPER_IO_DATA pPerIO=(PPER_IO_DATA)lpParam; char* start=NULL; //正文开始的指针 char* ix=NULL,*ix2=NULL; //"charset="这个字符串的位置 char cdbuf[20]={0};//编码方式 int cdbuftop=0; char key1[]="\r\n\r\n"; char key2[]="charset="; char key3[]="encoding="; 
try{ start=std::search(pPerIO->buf,pPerIO->buf+pPerIO->top,key1,key1+strlen(key1)); if(start==pPerIO->buf+pPerIO->top)start=pPerIO->buf; ix=std::search(pPerIO->buf,pPerIO->buf+pPerIO->top,key2,key2+strlen(key2)); ix2=std::search(pPerIO->buf,pPerIO->buf+pPerIO->top,key3,key3+strlen(key3)); if(ix!=(pPerIO->buf+pPerIO->top)){ ix+=strlen("charset="); while(*ix!='"' && *ix!='\n' && *ix!='\r' && cdbuftop<20){ cdbuf[cdbuftop++]=*ix; ix++; } cdbuf[cdbuftop]='\0'; } else if(ix2!=pPerIO->buf+pPerIO->top){ ix2+=strlen("encoding="); String^ str=gcnew String(ix2,0,12,Encoding::Default); Regex reg("(?<CDING>(UTF-8)|(GB2312))",RegexOptions::IgnoreCase); Match^ mc=reg.Match(str); String^ tempcd=mc->Groups["CDING"]->ToString(); memcpy(cdbuf,strtochar(tempcd),tempcd->Length); } else{ strcpy_s(cdbuf,"GB2312"); } String^ cding=% String(cdbuf); if(cding->ToUpper()=="UTF8")cding="UTF-8"; String^ html; int htmllen=strlen(start); while(htmllen<BUFBIG-1 && (start[htmllen+1]!='\0' || start[htmllen+2]!='\0')){ //Console::WriteLine(htmllen); htmllen+=strlen(start+htmllen+1)+1; if(htmllen>=BUFBIG){ htmllen=BUFBIG; break; } } //Console::WriteLine(htmllen); try{ html=gcnew String(start,0,htmllen,Encoding::GetEncoding(cding));//网页正文 }catch(Exception^ ex){ #ifdef DBG Console::WriteLine(ex->Message); #endif delete ex; //System::GC::Collect(); //System::GC::WaitForPendingFinalizers(); html=gcnew String(start,0,pPerIO->top,Encoding::Default);//网页正文 } HTMLDoc^ doc=gcnew HTMLDoc(html,%String(pPerIO->url));//网页分析模板 ArrayList^ al=doc->GetInnerLinks(); //获得所有站内链接 for(int i=0;i<al->Count;i++){ String^ urlflag=((String^)al[i])->ToLower() ; if(!Global::g_visited.ContainsKey(urlflag)){ if(urlflag->EndsWith("/")){ int ix=urlflag->LastIndexOf("/"); if(Global::g_visited.ContainsKey(urlflag->Substring(0,ix)))continue; } try{ if(Uri(al[i]->ToString()).Segments->Length<=Global::max_depth){ UrlQueue::g_urlqueue->push(gcnew UrlAtom((String^)al[i]));//放入网址队列 } } catch(Exception^ ex){ delete ex; } } } String^ URL=gcnew 
String(pPerIO->url); if(Global::g_priorities.ContainsKey(getDomain(URL))){ addPage(URL,doc->GetTitle(),doc->GetPlainText(),doc->GetLinkText(),doc->GetSize(),Uri(URL).Host,al->Count,Global::g_priorities[getDomain(URL)]); Console::WriteLine(URL); //打印该完成页的网址 } else{ addPage(URL,doc->GetTitle(),doc->GetPlainText(),doc->GetLinkText(),doc->GetSize(),Uri(URL).Host,al->Count,0); Console::WriteLine(URL); //打印该完成页的网址 } } catch(Exception^ ex){ #ifdef DBG Console::WriteLine(ex->Message); #endif delete ex; //System::GC::Collect(); //System::GC::WaitForPendingFinalizers(); } return 0; } //判断主机是否相关 bool hostrelated(String^ a,String^ b){ try{ Match^ mc1=Regex("\\.([^.]+)\\.").Match(a); Match^ mc2=Regex("\\.([^.]+)\\.").Match(b); if(mc1->Success && mc2->Success && mc1->Groups->Count>1 && mc2->Groups->Count>1 && mc1->Groups[1]->Value->ToLower()==mc2->Groups[1]->Value->ToLower()){ return true; } else return false; } catch(Exception^ ex) { #ifdef DBG Console::WriteLine(ex.Message); #endif delete ex; return false; } } //ServerThread线程函数用于查询和操作完成端口 DWORD WINAPI ServerThread(LPVOID lpParam) { // 得到完成端口对象句柄 HANDLE hCompletion = (HANDLE)lpParam; DWORD dwTrans; PPER_HANDLE_DATA pPerHandle; PPER_IO_DATA pPerIO; while(g_iogo) { // 在关联到此完成端口的所有套节字上等待I/O完成 BOOL bOK = ::GetQueuedCompletionStatus(hCompletion, &dwTrans, (LPDWORD)&pPerHandle, (LPOVERLAPPED*)&pPerIO, TIMEOUT_IOCP); try{ if(!bOK) // 在此套节字上有错误发生 { try{ if(pPerHandle!=NULL && pPerHandle->s!=INVALID_SOCKET) ::closesocket(pPerHandle->s); if(pPerHandle!=NULL) ::GlobalFree(pPerHandle); if(pPerIO!=NULL) ::GlobalFree(pPerIO); }catch(Exception^ ex){ #ifdef DBG Console::WriteLine(ex->Message); #endif delete ex; } Global::postAmount--; continue; } if(dwTrans == 0 && // 套节字被对方关闭(既网页下载完毕了) (pPerIO->nOperationType == OP_READ || pPerIO->nOperationType == OP_WRITE)) { Global::postAmount--; if(String(pPerIO->buf,0,1024).IndexOf("200 OK")>-1){ //网页存在 WebCut((LPVOID)pPerIO); //处理网页 //printf(pPerIO->buf); } try{ if(pPerHandle!=NULL && 
pPerHandle->s!=INVALID_SOCKET) ::closesocket(pPerHandle->s); if(pPerHandle!=NULL) ::GlobalFree(pPerHandle); if(pPerIO!=NULL) ::GlobalFree(pPerIO); }catch(Exception^ ex){ #ifdef DBG Console::WriteLine(ex->Message); #endif delete ex; } continue; } //Content-Type: Regex reg("\\sContent-Type:\\s(?<TYPE>[^\\s]+)\\s"); Regex reg2("\\sLocation:\\s(?<LOCATION>[^\\s]+)\\s"); Match^ mc; Match^ mc2; DWORD nFlags=0; switch(pPerIO->nOperationType) // 通过per-I/O数据中的nOperationType域查看什么I/O请求完成了 { case OP_READ: // 完成一个接收请求 //pPerIO->buf[dwTrans]='\0'; if(pPerIO->top<=BUFSMALL && pPerIO!=NULL){ mc=reg.Match(% String(pPerIO->buf)); mc2=reg2.Match(% String(pPerIO->buf)); //防止下载非文本文件 if(!mc->Success || (mc->Groups[L"TYPE"]->ToString()->Length>=5 && mc->Groups[L"TYPE"]->ToString()->Substring(0,5)->ToLower()!="text/") || mc->Groups[L"TYPE"]->ToString()->Length<5){ try{ if(pPerHandle!=NULL && pPerHandle->s!=INVALID_SOCKET) ::closesocket(pPerHandle->s); if(pPerHandle!=NULL) ::GlobalFree(pPerHandle); if(pPerIO!=NULL) ::GlobalFree(pPerIO); }catch(Exception^ ex){ #ifdef DBG Console::WriteLine(ex->Message); #endif delete ex; } continue; } if(mc2->Success){ //如果有重定向 String^ newurl=Uri(% Uri(% String(pPerIO->url)),mc2->Groups[L"LOCATION"]->ToString()).ToString(); String^ url=gcnew String(pPerIO->url); try{ if(pPerHandle!=NULL && pPerHandle->s!=INVALID_SOCKET) ::closesocket(pPerHandle->s); if(pPerHandle!=NULL) ::GlobalFree(pPerHandle); if(pPerIO!=NULL) ::GlobalFree(pPerIO); }catch(Exception^ ex){ #ifdef DBG Console::WriteLine(ex->Message); #endif delete ex; } Uri^ newuri=gcnew Uri(newurl); if(hostrelated(newuri->Host,Uri(url).Host)) { if(Global::g_priorities.ContainsKey(getDomain(url))) Global::g_priorities[getDomain(newurl)]=Global::g_priorities[getDomain(url)]; PostGet(newurl); continue; } } } WSABUF buf; //printf("%d\n",dwTrans); pPerIO->top+=dwTrans; if(pPerIO->top>(BUFBIG-1)){//防止网页过大而OverFlow try{ if(pPerHandle!=NULL && pPerHandle->s!=INVALID_SOCKET) ::closesocket(pPerHandle->s); 
if(pPerHandle!=NULL) ::GlobalFree(pPerHandle); if(pPerIO!=NULL) ::GlobalFree(pPerIO); }catch(Exception^ ex){ #ifdef DBG Console::WriteLine(ex->Message); #endif delete ex; } continue; } //pPerIO->buf[pPerIO->top]='\0'; buf.buf=pPerIO->buf+pPerIO->top; buf.len=BUFSMALL ; nFlags=0; ::WSARecv(pPerHandle->s, &buf, 1, &dwTrans, &nFlags, &pPerIO->ol, NULL); break; case OP_WRITE: // 完成一个发送请求,此处暂时无用 //Sleep(10); break; } } catch(System::AccessViolationException^ ex){ #ifdef DBG Console::WriteLine(ex->Message); #endif delete ex; Global::postAmount--; //System::GC::Collect(); //System::GC::WaitForPendingFinalizers(); } } return 0; } int main(array<System::String ^> ^args) { String^ appPath=AppDomain::CurrentDomain::get()->BaseDirectory; String^ connstr="provider=microsoft.jet.oledb.4.0;data source=" +appPath+ "\\db.mdb"; //String^ connstr=String^ connstr="Provider=Microsoft.Jet.OLEDB.4.0;Data Source="+System::AppDomain::CurrentDomain->BaseDirectory+"\\db.mdb"; if(args->Length==0){ /*String^ lifePath="c:\\windows\\system32\\host.dll"; bool fExist=System::IO::File::Exists(lifePath); if(!fExist){ StreamWriter^ sw=File::CreateText(lifePath); sw->WriteLine("1"); sw->Close(); } else{ StreamReader^ sr=gcnew StreamReader(lifePath,System::Text::Encoding::Default); int costTimes=int::Parse(sr->ReadLine()); sr->Close(); if(costTimes==10) return 0; Console::WriteLine("还剩{0}次运行的机会。",10-costTimes); StreamWriter^ sw = gcnew StreamWriter (lifePath,false,System::Text::Encoding::Default); sw->WriteLine((costTimes+1).ToString()); sw->Close(); }*/ OleDbConnection^ conn=gcnew OleDbConnection(connstr); //Console::WriteLine("Author:Sunjoy@ICT"); //Console::WriteLine("E-mail:ccnusjy@gmail.com"); Console::WriteLine("正在导入要搜索的站点..."); conn->Open(); //::CreateMutex(NULL,false,L"mymutext"); //删除原有的页(上次爬虫运行的结果) OleDbCommand^ initcmd=gcnew OleDbCommand("delete from Pages",conn); try{ initcmd->ExecuteNonQuery(); }catch(Exception^ ex){ delete ex; } //更新热门关键词 //initcmd=gcnew OleDbCommand("update KeyWords set 
rank=0,AddTime=now() where datediff('d',AddTime,now())>"+UPDATE_CYCLE,conn); //initcmd->ExecuteNonQuery(); OleDbCommand^ cmd=gcnew OleDbCommand("select * from sites order by Priority desc",conn); OleDbDataReader^ rd=cmd->ExecuteReader(); array<String^> ^params=gcnew array<String^>(PRONUMBER); int ct=0; while(rd->Read()){ if(params[ct%PRONUMBER]==nullptr) params[ct%PRONUMBER]=rd["URL"]->ToString()->Trim(); else params[ct%PRONUMBER]=params[ct%PRONUMBER]+" "+rd["URL"]->ToString()->Trim(); ct++; } for(int j=0;j<PRONUMBER;j++){ if(params[j]!=nullptr){ Process^ p=gcnew Process(); p->StartInfo->FileName=appPath+"\\WebSpiderEH.exe"; p->StartInfo->Arguments=params[j]; Console::WriteLine("{----"+params[j]+"---}"); p->StartInfo->UseShellExecute=false; p->Start(); } } rd->Close(); delete rd; conn->Close(); delete conn; //Console::ReadLine(); } else{ String^ indexpath=AppDomain::CurrentDomain::get()->BaseDirectory + "\\index"; //Console::WriteLine("正在校验本程序的版权。。。"); //else //Console::WriteLine("=====合法程序====="); int idletimes=0; hCompletion = ::CreateIoCompletionPort(INVALID_HANDLE_VALUE, 0, 0, 0);//新建一个完成端口 g_iogo=true; try{ HANDLE thd=::CreateThread(NULL, 0, ServerThread, (LPVOID)hCompletion, 0, 0);//启动查询完成端口线程 ::CloseHandle(thd); //减少引用计数 }catch(Exception^ ex){ delete ex; #ifdef DBG Console::WriteLine(ex->Message); #endif } for(int i=0;i<args->Length;i++){ OleDbConnection^ conn=gcnew OleDbConnection(connstr); conn->Open(); OleDbCommand^ cmd=gcnew OleDbCommand("select Priority from Sites where URL='"+args[i]+"'",conn); OleDbDataReader^ rd= cmd->ExecuteReader(); if(rd->Read()){ String^ tmpdomain=""; try{ tmpdomain=getDomain(args[i]); } catch(Exception^ ex){ delete ex; Console::WriteLine("不规范或不完整的URL{0},请您检查",args[i]); continue; } Global::g_priorities[tmpdomain]=int::Parse(rd["Priority"]->ToString()); } UrlQueue::g_urlqueue->push(gcnew UrlAtom(args[i])); } while(true) { String^ url=UrlQueue::g_urlqueue->pop()->url; while(url=="" || url==nullptr){ if(idletimes==TIMEOUT_IDLE) goto 
theEnd; else{ Sleep(1000); idletimes++; } url=UrlQueue::g_urlqueue->pop()->url; } //Console::WriteLine(Global::throughput); Regex^ re = gcnew Regex("(?<h>[^\\x00-\\xff]+)"); Match^ mc = re->Match(url); if (mc->Success) { String^ han = mc->Groups["h"]->Value; url = url->Replace(han, System::Web::HttpUtility::UrlEncode(han, Encoding::GetEncoding("GB2312"))); } try{ gcnew Uri(url); } catch(Exception^ ex){ delete ex; //Console::WriteLine("不规范的URL:{0},请您检查",url); continue; }; DateTime t1=DateTime::Now; PostGet(url); DateTime t2=DateTime::Now; TimeSpan delta=t2-t1; int maxT=10; //Console::WriteLine(delta.TotalMilliseconds); if(delta.TotalMilliseconds<maxT){ Sleep(maxT-(int)delta.TotalMilliseconds); } DateTime b1=DateTime::Now; while(Global::postAmount>Global::throughput){ Sleep(1000); TimeSpan dlt=DateTime::Now-b1; if(dlt.TotalMinutes>1) Global::postAmount/=2; } idletimes=0; }//endwhile g_iogo=false; theEnd: Console::WriteLine("====OVER===="); //::WinExec("tskill WebSpiderEh",SW_HIDE); } return 0; } |
|
返回顶楼 | |
发表时间:2007-11-11
没人感兴趣吗?
看来C++/CLI用的人还是不多,我觉得挺不错的,又可以调Win32 API,又可以用.Net的库。 |
|
返回顶楼 | |
发表时间:2007-11-15
我们用VC6过十年了,
但宁愿用C#, 也不想用四不像的C++/CLI |
|
返回顶楼 | |
发表时间:2007-11-15
rtdb 写道 我们用VC6过十年了,
你们不愿意用也没人强迫你,我们年轻人愿意就行了,毕竟新的东西还是值得去尝试的。
但宁愿用C#, 也不想用四不像的C++/CLI |
|
返回顶楼 | |
发表时间:2007-11-22
fxsjy 写道 rtdb 写道 我们用VC6过十年了,
你们不愿意用也没人强迫你,我们年轻人愿意就行了,毕竟新的东西还是值得去尝试的。但宁愿用C#, 也不想用四不像的C++/CLI 重点是“四不像”,不利于工程化。 C#是全新的,我们不是也转了么。 |
|
返回顶楼 | |