Linux获取网页源码的几种方法
Linux Socket Chrome C C++
JavaEye博客还是本科做毕业设计时候开通的,基本上荒废了,现在决定记录下平时编程遇到的问题或者解决方案。
第一种方法是利用Linux下的工具来获取网页源码。我用的是Wget,也可以使用Curl;Curl更加灵活,可以设置很多参数。
//Quotes text so /bin/sh sees it as exactly one word: the text is wrapped in
//single quotes and every embedded ' is rewritten as '\'' .
static string ShellQuote(const string& text)
{
    string quoted = "'";
    for (string::size_type i = 0; i < text.size(); ++i)
    {
        if (text[i] == '\'')
        {
            quoted.append("'\\''");
        }
        else
        {
            quoted.push_back(text[i]);
        }
    }
    quoted.push_back('\'');
    return quoted;
}

//Fetches a web page by running wget and returns the downloaded bytes.
//
//url:     address of the page; the text after its last '/' is used as the
//         name of a temporary file in the current working directory.
//returns: the file contents exactly as downloaded (line breaks included),
//         or "" when the URL ends in '/' (no file name can be derived),
//         when wget reports failure, or when the file cannot be opened.
//
//NOTE(review): a file of the same name that already exists in the working
//directory is overwritten and then deleted, as it was deleted before.
string GetHtmlByWget(string url)
{
    //Derive the name of the downloaded file from the URL
    string::size_type slashPos = url.find_last_of("/");
    string fileName = (slashPos == string::npos) ? url : url.substr(slashPos + 1);
    if (fileName == "")
    {
        return "";
    }

    //The "./" prefix keeps a name starting with '-' from being read as an option
    string localPath = "./" + fileName;
    string quotedPath = ShellQuote(localPath);

    //wget: -q prints no progress, -O forces the output file name so that the
    //file read below is the one wget wrote; "--" ends option parsing. Both
    //arguments are shell-quoted because the URL is untrusted input.
    string strCom = "wget -q -O ";
    strCom.append(quotedPath);
    strCom.append(" -- ");
    strCom.append(ShellQuote(url));
    int wgetStatus = system(strCom.c_str());

    string strHtml = "";
    if (wgetStatus == 0)
    {
        ifstream fin(localPath.c_str());
        if (fin)
        {
            //Read in fixed-size chunks; unlike a bounded getline this neither
            //stops at over-long lines nor drops the line breaks
            char chunk[1024];
            while (fin.read(chunk, sizeof(chunk)) || fin.gcount() > 0)
            {
                strHtml.append(chunk, static_cast<string::size_type>(fin.gcount()));
            }
            fin.close();
        }
    }

    //Remove the temporary file; -f suppresses prompts and missing-file errors.
    //This also runs after a failed download, because wget -O may leave an
    //empty file behind.
    strCom = "rm -f -- ";
    strCom.append(quotedPath);
    system(strCom.c_str());

    return strHtml; //page source, or "" on failure
}
第二种方法是用socket来获取源码。
- //通过GET获取网页源码
- stringGetHtmlByGet(stringurl)
- {
- stringstrHtmlContent="";
- intsockfd;
- structsockaddr_inaddr;
- structhostent*pURL;
- chartext[RECVBUF];
- //分析链接
- UrlInfourlInfo=ParseURL(url);
- stringsAccept="Accept:*/*\r\nAccept-Language:zh-cn\r\nAccept-Encoding:gzip,deflate";
- //不同的主机UserAgent不同
- stringsUserAgent="Mozilla/5.0(X11;U;Linuxi686;en-US)AppleWebKit/534.10(KHTML,likeGecko)Chrome/8.0.552.224Safari/534.10";
- //将端口转换为字符串
- chart[6];
- stringstrPort;
- sprintf(t,"%d",urlInfo.Port);
- strPort=t;
- //构造发送字符串
- stringstrRequest="";
- strRequest.append("GET");
- strRequest.append(urlInfo.File);
- strRequest.append("?");
- strRequest.append(urlInfo.Body);
- strRequest.append("HTTP/1.1\r\n");
- strRequest.append(sAccept);
- strRequest.append("\r\nUser-Agent:");
- strRequest.append(sUserAgent);
- strRequest.append("\r\nHost:");
- strRequest.append(urlInfo.Host);
- strRequest.append(":");
- strRequest.append(strPort);
- strRequest.append("\r\nConnection:Keep-Alive\r\n\r\n");
- char*host=const_cast<char*>(urlInfo.Host.c_str());
- sockfd=socket(AF_INET,SOCK_STREAM,IPPROTO_TCP);//TCP方式发送
- pURL=gethostbyname(host);
- addr.sin_family=AF_INET;
- addr.sin_addr.s_addr=*((unsignedlong*)pURL->h_addr);
- addr.sin_port=htons(80);
- //连接
- connect(sockfd,(structsockaddr*)&addr,sizeof(addr));
- //发送
- send(sockfd,const_cast<char*>(strRequest.c_str()),strRequest.length(),0);
- //接受
- while(recv(sockfd,text,RECVBUF,0)>0)
- {
- strHtmlContent.append(text);
- bzero(text,RECVBUF);
- }
- //关闭socket
- close(sockfd);
- //返回接受结果
- returnstrHtmlContent;
- }
//通过GET获取网页源码 string GetHtmlByGet(string url) { string strHtmlContent = ""; int sockfd; struct sockaddr_in addr; struct hostent *pURL; char text[RECVBUF]; //分析链接 UrlInfo urlInfo = ParseURL(url); string sAccept = "Accept: */*\r\nAccept-Language: zh-cn\r\nAccept-Encoding: gzip, deflate"; //不同的主机UserAgent不同 string sUserAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10"; //将端口转换为字符串 char t[6]; string strPort; sprintf(t,"%d", urlInfo.Port); strPort = t; //构造发送字符串 string strRequest = ""; strRequest.append("GET "); strRequest.append(urlInfo.File); strRequest.append("?"); strRequest.append(urlInfo.Body); strRequest.append(" HTTP/1.1\r\n"); strRequest.append(sAccept); strRequest.append("\r\nUser-Agent:"); strRequest.append(sUserAgent); strRequest.append("\r\nHost:"); strRequest.append(urlInfo.Host); strRequest.append(":"); strRequest.append(strPort); strRequest.append("\r\nConnection: Keep-Alive\r\n\r\n"); char* host = const_cast<char*>(urlInfo.Host.c_str()); sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); //TCP方式发送 pURL = gethostbyname(host); addr.sin_family = AF_INET; addr.sin_addr.s_addr = *((unsigned long*)pURL->h_addr); addr.sin_port = htons(80); //连接 connect(sockfd,(struct sockaddr *)&addr,sizeof(addr)); //发送 send(sockfd, const_cast<char*>(strRequest.c_str()), strRequest.length(), 0); //接受 while(recv(sockfd, text, RECVBUF, 0) > 0) { strHtmlContent.append(text); bzero(text,RECVBUF); } //关闭socket close(sockfd); //返回接受结果 return strHtmlContent; }
第三种方法是使用libcurl。
- #include<stdio.h>
- #include<string.h>
- #include<curl/curl.h>
/* Capacity of the response buffer in bytes, not counting the trailing NUL. */
#define MAX_BUF 65536

/* Response data collected so far; always NUL-terminated after a write. */
char wr_buf[MAX_BUF+1];
/* Number of bytes currently stored in wr_buf. */
int  wr_index;

/*
 * Write data callback function (called within the context of
 * curl_easy_perform).
 *
 * buffer: data delivered by curl (not NUL-terminated)
 * size, nmemb: the delivered data is size*nmemb bytes long
 * userp: pointer to an int error flag; set to 1 when the data is rejected
 *
 * Returns the number of bytes consumed (size*nmemb) on success. Returns 0
 * and leaves wr_buf and wr_index unchanged when the data does not fit.
 */
size_t write_data( void *buffer, size_t size, size_t nmemb, void *userp )
{
  size_t segsize;
  size_t room;

  /* Reject a corrupt write index and a size*nmemb product that would wrap
   * around; either one would defeat the bounds check below.
   */
  if (wr_index < 0 || wr_index > MAX_BUF ||
      (nmemb != 0 && size > (size_t)-1 / nmemb)) {
    if (userp) *(int *)userp = 1;
    return 0;
  }

  segsize = size * nmemb;
  room = (size_t)(MAX_BUF - wr_index);

  /* Check to see if this data exceeds the size of our buffer. If so,
   * set the user-defined context value and return 0 to indicate a
   * problem to curl.
   */
  if (segsize > room) {
    if (userp) *(int *)userp = 1;
    return 0;
  }

  /* Copy the data from the curl buffer into our buffer */
  if (segsize > 0) {
    memcpy( &wr_buf[wr_index], buffer, segsize );
  }

  /* Update the write index; segsize <= MAX_BUF here, so it fits in an int */
  wr_index += (int)segsize;

  /* Null terminate the buffer */
  wr_buf[wr_index] = 0;

  /* Return the number of bytes received, indicating to curl that all is okay */
  return segsize;
}
- /*
- *Simplecurlapplicationtoreadtheindex.htmlfilefromaWebsite.
- */
- intmain(void)
- {
- CURL*curl;
- CURLcoderet;
- intwr_error;
- wr_error=0;
- wr_index=0;
- /*Firststep,initcurl*/
- curl=curl_easy_init();
- if(!curl){
- printf("couldn'tinitcurl\n");
- return0;
- }
- /*TellcurltheURLofthefilewe'regoingtoretrieve*/
- curl_easy_setopt(curl,CURLOPT_URL,"www.exampledomain.com");
- /*Tellcurlthatwe'llreceivedatatothefunctionwrite_data,and
- *alsoprovideitwithacontextpointerforourerrorreturn.
- */
- curl_easy_setopt(curl,CURLOPT_WRITEDATA,(void*)&wr_error);
- curl_easy_setopt(curl,CURLOPT_WRITEFUNCTION,write_data);
- /*Allowcurltoperformtheaction*/
- ret=curl_easy_perform(curl);
- printf("ret=%d(write_error=%d)\n",ret,wr_error);
- /*Emitthepageifcurlindicatesthatnoerrorsoccurred*/
- if(ret==0)printf("%s\n",wr_buf);
- curl_easy_cleanup(curl);
- return0;
- }