Linux获取网页源码的几种方法

Linux Socket Chrome C C++
JavaEye博客还是本科做毕业设计时候开通的,基本上荒废了,现在决定记录下平时编程遇到的问题或者解决方案。

第一个方法是利用Linux下的工具来获取网页源码。我用的是Wget,也可以使用Curl;Curl更加灵活,可以设置很多参数。

C++代码
  #include <stdio.h>   // popen, pclose, fread (popen/pclose are POSIX, not ISO C++)
  #include <string>

  // Wraps `text` in single quotes so that /bin/sh hands it to the command as
  // one literal argument. An embedded single quote is written as '\'' (close
  // the quote, emit an escaped quote, reopen the quote).
  static std::string ShellQuote(const std::string& text)
  {
      std::string quoted("'");
      for (std::string::size_type i = 0; i < text.size(); ++i)
      {
          if (text[i] == '\'')
          {
              quoted.append("'\\''");
          }
          else
          {
              quoted.push_back(text[i]);
          }
      }
      quoted.push_back('\'');
      return quoted;
  }

  // Downloads a web page by running the `wget` command-line tool and capturing
  // what it writes to standard output.
  //
  // Parameters:
  //   url - address of the page to download.
  //
  // Returns:
  //   The downloaded bytes exactly as wget produced them (line breaks are
  //   kept), or an empty string when
  //     * the URL is empty or ends with '/' (no file-name component; this
  //       matches the earlier temp-file based version, which refused such
  //       URLs before running wget),
  //     * the command could not be started, or
  //     * wget exited with a non-zero status.
  //
  // Notes:
  //   * `wget` must be reachable through PATH; popen() runs the command
  //     through the shell.
  //   * The page is streamed with "-O -" instead of being saved under a name
  //     derived from the URL, so nothing in the current directory is created,
  //     read by mistake, or deleted.
  std::string GetHtmlByWget(std::string url)
  {
      // No file-name component -> nothing to fetch (contract kept from the
      // original implementation).
      if (url.empty() || url[url.size() - 1] == '/')
      {
          return std::string();
      }

      // -q   : suppress progress output
      // -O - : write the document to stdout
      // --   : end of options, so a URL starting with '-' is not taken as one
      std::string command("wget -q -O - -- ");
      command.append(ShellQuote(url));   // quoting keeps '&', ';', spaces literal

      FILE* pipe = popen(command.c_str(), "r");
      if (pipe == NULL)
      {
          return std::string();
      }

      // Read in fixed-size chunks; unlike getline() into a 1024-byte buffer,
      // this imposes no limit on line length and keeps every byte.
      std::string html;
      char chunk[4096];
      size_t got = 0;
      while ((got = fread(chunk, 1, sizeof(chunk), pipe)) > 0)
      {
          html.append(chunk, got);
      }

      // A non-zero status means wget failed (DNS error, HTTP error, ...);
      // discard any partial output in that case.
      if (pclose(pipe) != 0)
      {
          return std::string();
      }
      return html;
  }

第二个是用socket来获取源码

C++代码
  1. //通过GET获取网页源码
  2. stringGetHtmlByGet(stringurl)
  3. {
  4. stringstrHtmlContent="";
  5. intsockfd;
  6. structsockaddr_inaddr;
  7. structhostent*pURL;
  8. chartext[RECVBUF];
  9. //分析链接
  10. UrlInfourlInfo=ParseURL(url);
  11. stringsAccept="Accept:*/*\r\nAccept-Language:zh-cn\r\nAccept-Encoding:gzip,deflate";
  12. //不同的主机UserAgent不同
  13. stringsUserAgent="Mozilla/5.0(X11;U;Linuxi686;en-US)AppleWebKit/534.10(KHTML,likeGecko)Chrome/8.0.552.224Safari/534.10";
  14. //将端口转换为字符串
  15. chart[6];
  16. stringstrPort;
  17. sprintf(t,"%d",urlInfo.Port);
  18. strPort=t;
  19. //构造发送字符串
  20. stringstrRequest="";
  21. strRequest.append("GET");
  22. strRequest.append(urlInfo.File);
  23. strRequest.append("?");
  24. strRequest.append(urlInfo.Body);
  25. strRequest.append("HTTP/1.1\r\n");
  26. strRequest.append(sAccept);
  27. strRequest.append("\r\nUser-Agent:");
  28. strRequest.append(sUserAgent);
  29. strRequest.append("\r\nHost:");
  30. strRequest.append(urlInfo.Host);
  31. strRequest.append(":");
  32. strRequest.append(strPort);
  33. strRequest.append("\r\nConnection:Keep-Alive\r\n\r\n");
  34. char*host=const_cast<char*>(urlInfo.Host.c_str());
  35. sockfd=socket(AF_INET,SOCK_STREAM,IPPROTO_TCP);//TCP方式发送
  36. pURL=gethostbyname(host);
  37. addr.sin_family=AF_INET;
  38. addr.sin_addr.s_addr=*((unsignedlong*)pURL->h_addr);
  39. addr.sin_port=htons(80);
  40. //连接
  41. connect(sockfd,(structsockaddr*)&addr,sizeof(addr));
  42. //发送
  43. send(sockfd,const_cast<char*>(strRequest.c_str()),strRequest.length(),0);
  44. //接受
  45. while(recv(sockfd,text,RECVBUF,0)>0)
  46. {
  47. strHtmlContent.append(text);
  48. bzero(text,RECVBUF);
  49. }
  50. //关闭socket
  51. close(sockfd);
  52. //返回接受结果
  53. returnstrHtmlContent;
  54. }
//通过GET获取网页源码
string GetHtmlByGet(string url)
{
string strHtmlContent = "";
int sockfd;
struct sockaddr_in addr;
struct hostent *pURL;
char text[RECVBUF];
//分析链接
UrlInfo urlInfo = ParseURL(url);
string sAccept = "Accept: */*\r\nAccept-Language: zh-cn\r\nAccept-Encoding: gzip, deflate";
//不同的主机UserAgent不同
string sUserAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10";
//将端口转换为字符串
char t[6];
string  strPort;
sprintf(t,"%d", urlInfo.Port);
strPort = t;
//构造发送字符串
string strRequest = "";
strRequest.append("GET ");
strRequest.append(urlInfo.File);
strRequest.append("?");
strRequest.append(urlInfo.Body);
strRequest.append(" HTTP/1.1\r\n");
strRequest.append(sAccept);
strRequest.append("\r\nUser-Agent:");
strRequest.append(sUserAgent);
strRequest.append("\r\nHost:");
strRequest.append(urlInfo.Host);
strRequest.append(":");
strRequest.append(strPort);
strRequest.append("\r\nConnection: Keep-Alive\r\n\r\n");
char* host = const_cast<char*>(urlInfo.Host.c_str());
sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); //TCP方式发送
pURL = gethostbyname(host);
addr.sin_family = AF_INET;
addr.sin_addr.s_addr = *((unsigned long*)pURL->h_addr);
addr.sin_port = htons(80);
//连接
connect(sockfd,(struct sockaddr *)&addr,sizeof(addr));
//发送
send(sockfd, const_cast<char*>(strRequest.c_str()), strRequest.length(), 0);
//接受
while(recv(sockfd, text, RECVBUF, 0) > 0)
{
strHtmlContent.append(text);
bzero(text,RECVBUF);
}
//关闭socket
close(sockfd);
//返回接受结果
return strHtmlContent;
}

第三个是使用libcurl来获取源码

C代码
  #include <stdio.h>
  #include <string.h>
  #include <curl/curl.h>
  /* Capacity of the download buffer in bytes, not counting the final NUL. */
  #define MAX_BUF 65536

  /* Holds the downloaded data; one extra byte is reserved for the NUL. */
  char wr_buf[MAX_BUF + 1];
  /* Number of bytes currently stored in wr_buf. */
  int wr_index;

  /*
   * Write data callback function (called within the context of
   * curl_easy_perform).
   *
   * Appends the size * nmemb bytes at `buffer` to wr_buf and keeps wr_buf
   * NUL-terminated.
   *
   * buffer - data delivered by curl (not NUL-terminated)
   * size   - size of one element
   * nmemb  - number of elements
   * userp  - context pointer; when non-NULL it is treated as an int * and
   *          set to 1 if the data does not fit into wr_buf
   *
   * Returns the number of bytes stored. Returns 0 without storing anything
   * when the segment would exceed MAX_BUF; for a non-empty segment this
   * differs from the byte count and thereby signals a problem to curl.
   */
  size_t write_data(void *buffer, size_t size, size_t nmemb, void *userp)
  {
      /* Keep the length in size_t: storing it in an int could truncate it. */
      size_t segsize = size * nmemb;

      /* Check to see if this data exceeds the size of our buffer. If so,
       * set the user-defined context value and return 0 to indicate a
       * problem to curl.
       */
      if (wr_index < 0 || wr_index > MAX_BUF
          || segsize > (size_t)(MAX_BUF - wr_index)) {
          if (userp != NULL) {
              *(int *)userp = 1;
          }
          return 0;
      }

      /* Copy the data from the curl buffer into our buffer */
      memcpy(&wr_buf[wr_index], buffer, segsize);

      /* Update the write index */
      wr_index += (int)segsize;

      /* Null terminate the buffer */
      wr_buf[wr_index] = 0;

      /* Return the number of bytes received, indicating to curl that all is okay */
      return segsize;
  }
  31. /*
  32. *Simplecurlapplicationtoreadtheindex.htmlfilefromaWebsite.
  33. */
  34. intmain(void)
  35. {
  36. CURL*curl;
  37. CURLcoderet;
  38. intwr_error;
  39. wr_error=0;
  40. wr_index=0;
  41. /*Firststep,initcurl*/
  42. curl=curl_easy_init();
  43. if(!curl){
  44. printf("couldn'tinitcurl\n");
  45. return0;
  46. }
  47. /*TellcurltheURLofthefilewe'regoingtoretrieve*/
  48. curl_easy_setopt(curl,CURLOPT_URL,"www.exampledomain.com");
  49. /*Tellcurlthatwe'llreceivedatatothefunctionwrite_data,and
  50. *alsoprovideitwithacontextpointerforourerrorreturn.
  51. */
  52. curl_easy_setopt(curl,CURLOPT_WRITEDATA,(void*)&wr_error);
  53. curl_easy_setopt(curl,CURLOPT_WRITEFUNCTION,write_data);
  54. /*Allowcurltoperformtheaction*/
  55. ret=curl_easy_perform(curl);
  56. printf("ret=%d(write_error=%d)\n",ret,wr_error);
  57. /*Emitthepageifcurlindicatesthatnoerrorsoccurred*/
  58. if(ret==0)printf("%s\n",wr_buf);
  59. curl_easy_cleanup(curl);
  60. return0;
  61. }

发表回复