做了好几天,终于写出来了,以前没有想到过,用C++也可以爬取网页,经过这么多天的努力终于做好了,解决了乱码问题。
从中学到了很多,小到一个函数的参数,大到如何使用一个函数。
还有C++中一直让人头疼的编码问题,尤其是Unicode编码问题。为此查阅了很多资料,对MultiByteToWideChar和WideCharToMultiByte这两个函数也有了新的认识。
其中一个关键点是:Windows默认使用ANSI字符集;同时还对HTML的格式进行了分析,用来判断页面的编码。
感觉那么多天的辛苦没有白费,付出有了收获。不过在此,真的感谢那些牛人,期间也参考了他们的代码。
代码:
#include <winsock2.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#pragma comment(lib,"ws2_32.lib")
using namespace std;
// Returns a copy of `text` with the ASCII letters A-Z folded to lower case.
static std::string asciiLower(const std::string &text)
{
    std::string lowered(text);
    for (std::string::size_type i = 0; i < lowered.size(); ++i)
    {
        if (lowered[i] >= 'A' && lowered[i] <= 'Z')
            lowered[i] = static_cast<char>(lowered[i] - 'A' + 'a');
    }
    return lowered;
}

// Converts UTF-8 bytes to the system ANSI code page (CP_ACP) by going
// through UTF-16. Explicit lengths are passed to both API calls, so the
// input does not need to be NUL-terminated. If a conversion step fails,
// the input is returned unchanged.
static std::string utf8ToAnsi(const std::string &utf8)
{
    // The Win32 conversion functions take an int length.
    if (utf8.empty() || utf8.size() > 0x7fffffff)
        return utf8;

    const int srcLen = static_cast<int>(utf8.size());
    const int wideLen = MultiByteToWideChar(CP_UTF8, 0, utf8.data(), srcLen, NULL, 0);
    if (wideLen <= 0)
        return utf8;
    std::vector<wchar_t> wide(wideLen);
    if (MultiByteToWideChar(CP_UTF8, 0, utf8.data(), srcLen, &wide[0], wideLen) <= 0)
        return utf8;

    const int ansiLen = WideCharToMultiByte(CP_ACP, 0, &wide[0], wideLen, NULL, 0, NULL, NULL);
    if (ansiLen <= 0)
        return utf8;
    std::vector<char> ansi(ansiLen);
    if (WideCharToMultiByte(CP_ACP, 0, &wide[0], wideLen, &ansi[0], ansiLen, NULL, NULL) <= 0)
        return utf8;

    return std::string(ansi.begin(), ansi.end());
}

// Sends every byte of `data` on `sock`, retrying after partial sends.
// Returns false as soon as send() reports an error or makes no progress.
static bool sendAll(SOCKET sock, const std::string &data)
{
    std::string::size_type sent = 0;
    while (sent < data.size())
    {
        const int n = send(sock, data.data() + sent, static_cast<int>(data.size() - sent), 0);
        if (n <= 0)
            return false;
        sent += static_cast<std::string::size_type>(n);
    }
    return true;
}

// Resolves `dom`, connects to port 80, sends a GET request for `path` and
// appends everything the server returns (status line, headers and body) to
// `response`. Returns true when at least part of a response was received or
// the connection ended without a socket error. WSAStartup must already have
// been called.
static bool fetchResponse(const std::string &dom, const std::string &path, std::string &response)
{
    struct hostent *phost = gethostbyname(dom.c_str());
    if (phost == NULL || phost->h_addrtype != AF_INET || phost->h_addr_list[0] == NULL)
    {
        std::cerr << "resolve fail!" << std::endl;
        return false;
    }

    struct sockaddr_in addr;
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(80);
    memcpy(&addr.sin_addr, phost->h_addr_list[0], sizeof(addr.sin_addr));

    SOCKET sock = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (sock == INVALID_SOCKET)
    {
        std::cerr << "socket fail!" << std::endl;
        return false;
    }

    bool ok = false;
    if (connect(sock, reinterpret_cast<struct sockaddr *>(&addr), sizeof(addr)) == SOCKET_ERROR)
    {
        std::cerr << "connect fail!" << std::endl;
    }
    else
    {
        // "Connection: Close" asks the server to close the connection after
        // the response, which is what ends the recv loop below.
        const std::string request = "GET " + path + " HTTP/1.1\r\n"
                                    "Host: " + dom + "\r\nConnection: Close\r\n\r\n";
        if (!sendAll(sock, request))
        {
            std::cerr << "send fail!" << std::endl;
        }
        else
        {
            char buf[4096];
            int got = 0;
            // recv returns 0 on orderly close and SOCKET_ERROR (-1) on
            // failure; both must stop the loop.
            while ((got = recv(sock, buf, sizeof(buf), 0)) > 0)
                response.append(buf, static_cast<std::string::size_type>(got));
            if (got == SOCKET_ERROR)
                std::cerr << "recv fail!" << std::endl;
            ok = (got != SOCKET_ERROR) || !response.empty();
        }
    }
    closesocket(sock);
    return ok;
}

// Downloads a page over plain HTTP (port 80) and writes it to both the
// console and "temp.txt".
//
// url: "domain[/path]" without a scheme; a leading "http://" is tolerated
//      and stripped. When no path is given, "/" is requested.
//
// The domain and the path are written first, one per line. The complete
// response is then read into memory before any conversion, so a multi-byte
// character that straddles two recv() calls is not corrupted. The first
// 1024 bytes are searched, case-insensitively, for a charset name: if
// "gb2312" appears there before any "utf-8", the bytes are emitted as they
// arrived; otherwise the response is treated as UTF-8 and converted to the
// system ANSI code page.
//
// "open fail!" is reported on stdout when temp.txt cannot be opened; all
// other failures are reported on stderr.
void getWebPage(char *url)
{
    if (url == NULL || *url == '\0')
    {
        std::cerr << "empty url!" << std::endl;
        return;
    }

    std::ofstream of;
    of.open("temp.txt");
    if (!of)
    {
        std::cout << "open fail!" << std::endl;
        return;
    }

    std::string address(url);
    const std::string scheme("http://");
    if (asciiLower(address.substr(0, scheme.size())) == scheme)
        address.erase(0, scheme.size());

    // Split at the first '/': everything before it is the domain, the rest
    // (including the slash) is the request path.
    const std::string::size_type slash = address.find('/');
    const std::string dom = address.substr(0, slash);
    const std::string path = (slash == std::string::npos) ? std::string("/") : address.substr(slash);
    if (dom.empty())
    {
        std::cerr << "empty domain!" << std::endl;
        return;
    }

    std::cout << dom << std::endl;
    std::cout << path << std::endl;
    of << dom << std::endl;
    of << path << std::endl;

    WSADATA wsa;
    if (WSAStartup(MAKEWORD(2,2), &wsa) != 0)
    {
        std::cerr << "WSAStartup fail!" << std::endl;
        return;
    }

    std::string response;
    if (fetchResponse(dom, path, response))
    {
        const std::string head = asciiLower(response.substr(0, 1024));
        const std::string::size_type utf8Pos = head.find("utf-8");
        const std::string::size_type gbPos = head.find("gb2312");
        const bool passThrough = gbPos != std::string::npos &&
                                 (utf8Pos == std::string::npos || gbPos < utf8Pos);

        const std::string text = passThrough ? response : utf8ToAnsi(response);
        std::cout << text;
        of << text;
    }

    WSACleanup();
    of.close();
    std::cout << std::endl;
}
// Prompts with "http://", reads the rest of the address (domain and
// optional path) from standard input and fetches it with getWebPage.
// Returns 1 if no address could be read, 0 otherwise.
int main()
{
    char url[256] = "";
    std::cout << "http://";
    // Cap the extraction at sizeof(url) - 1 characters plus the terminator;
    // an unbounded >> into a char array overflows on long input.
    std::cin.width(sizeof(url));
    if (!(std::cin >> url))
        return 1;
    getWebPage(url);
    return 0;
}
对此,又对socket编程产生了兴趣,socket编程魅力无穷。