自己写的新浪博客下载器~~

最新推荐文章于 2025-09-13 09:00:00 发布

原创最新推荐文章于 2025-09-13 09:00:00 发布 · 1.5k 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#list #download #delete #struct #null #buffer

博客备份工具专栏收录该内容

0 篇文章

订阅专栏

博主分享了自己的C++实现的新浪博客下载器，用于博客备份和自然语言处理的语料收集。目前代码质量不高，存在一些问题，如需要已有文件夹作为保存路径、下载数量限制等，期待社区提供改进意见。

啦啦啦，啦啦啦，我是卖报的小行家~

第一次发东西，小激动下~~

在网上看见不少博客备份工具，但是要么用的语言我没接触过，要么就是linux下的，要么就是……收钱的。于是自己动手写了这么个东西出来。从代码质量能看出来，我水平太低啦。不过勉强实现了预期的功能，但是还没有进行完完整的调试，存在的毛病多多，暂时可以用来进行自然语言处理的语料收集部分，要想达到真正的备份功能还需要进行改进。写在这里，就是想与大伙分享下自己的劳动成果，也希望大家对里面的问题给出些解决办法。

接下来会有源代码、使用方法、注意事项以及到目前为止我自己发现并有待改进的地方

对于后面提出的问题，希望大家能给点建议或者解释也行呀~~新手不容易~

想要分的话可以到这里来领：

http://topic.csdn.net/u/20120219/10/cdee9533-09cd-4e9f-b7d9-db03f379bd13.html

言归正传，这个所谓的下载器也不过是一段还没完整调试过的源码。编译环境VS2008. 所用语言，C++

不说闲的了，先看代码吧

资源下载地址：

http://download.csdn.net/detail/shoulder1102/4073285

#include<windows.h>
#include<wininet.h>
#include<iostream>
#pragma comment(lib,"wininet.lib")
using namespace std;
// Extension appended to every saved file; also the marker that ends a collected link.
#define SAVE_TYPE ".html"
// Capacity, in bytes, of the URL and path buffers used throughout this file.
#define BUFFSIZE  1000
// Prefix that a piece of text must start with to be collected as a link.
#define STAR "http://blog.sina.com.cn/"

// Node of a URL list: `web` stores one URL, `next` points to the following node.
struct list_iterm {
	char web[ BUFFSIZE ];
	struct list_iterm *next;
};

// Table of list heads. For each entry, `web` is a page URL and `next` starts
// the chain of links collected from that page.
struct list_iterm head_list[1000];

// Index of the head_list entry currently being filled.
int num_of_head_list = 0;

//调用此函数可以把指定网址page_to_download的指定页面下载到指定的地方save_locatiaon保存下来
//调用有误则打印出相应的问题，并返回0；否则返回1.
int Download_page (char * save_location, char *page_to_download)
{
	DWORD byteread=0;
	char buffer[100];
	
	memset(buffer,0,100);
	HINTERNET internetopen;
	internetopen=InternetOpen(L"Testing",INTERNET_OPEN_TYPE_PRECONFIG,NULL,NULL,0);
	if (internetopen==NULL){
		cout<<"Internet open failed!"<<endl;
		return 0;
	}
	HINTERNET internetopenurl;
	internetopenurl=InternetOpenUrlA(internetopen,LPCSTR(page_to_download),NULL,0,INTERNET_FLAG_RELOAD,0);
	if (internetopenurl==NULL){
		cout<<"Internet open url failed!"<<endl;
		InternetCloseHandle(internetopen);
		return 0;
	}

	BOOL hwrite;
	DWORD written;
	HANDLE createfile;
	createfile=CreateFileA(LPCSTR(save_location),GENERIC_WRITE,0,0,CREATE_ALWAYS,FILE_ATTRIBUTE_NORMAL,0);
	if (createfile==INVALID_HANDLE_VALUE){
		cout<<"Create File failed!"<<endl;
	    InternetCloseHandle(internetopenurl);
		return 0;
	}
	BOOL internetreadfile;
	while(1){
		internetreadfile=InternetReadFile(internetopenurl,buffer,sizeof(buffer),&byteread);
	   if(byteread==0)
			break;
	   hwrite=WriteFile(createfile,buffer,sizeof(buffer),&written,NULL);
	   if (hwrite==0){
			cout<<"Write to file failed!"<<endl;
	   }
	}
	CloseHandle(createfile);
	cout<<"Finished downloading!"<<endl;
	return 1;
}

//该函数生成用于保存文件的地址
void Creat_save_location ( char *save_location, int page ) {
	char location_buff [ BUFFSIZE ];
	sprintf(location_buff, "%s%d%s",save_location,page, SAVE_TYPE );
	strcpy( save_location, location_buff);
}

//List insertion: a newly allocated copy of iterm's URL is linked in directly
//behind the head node head_list[num_of_head_list]. iterm itself is not kept.
void Insert (struct list_iterm * iterm ) {
	struct list_iterm &head = head_list [ num_of_head_list ];
	struct list_iterm *copy = new list_iterm;
	copy->next = head.next;
	strcpy ( copy->web, iterm->web );
	head.next = copy;
}
//用于收集指定位置 location的文件中包含的所有链接，并保存在head_list链表中
void Collect_website ( char *location) {
    FILE *fp;
	char buf[10000];
	char *pre_position;
	char *check_position;
	char *lat_position;
	struct list_iterm *new_list_node = new ( list_iterm );
	fp=fopen( location,"rt");
	while(fgets(buf,9999,fp)!=NULL){
		pre_position = strstr ( buf, STAR );
		while(pre_position != NULL){
			lat_position = strstr( pre_position,SAVE_TYPE );
			if(lat_position ==NULL)
				break;
			int i=0;
			char web[BUFFSIZE];
			while( pre_position < lat_position ){
				web [i++]=*pre_position;
				pre_position ++;
			}
			web[i]='\0';
			strcat( web, SAVE_TYPE );
			strcpy ( new_list_node->web, web );
			new_list_node->next = NULL;
			Insert (new_list_node );
			//cout<<web<<endl;
			pre_position =strstr( lat_position,STAR );
			if(pre_position==NULL)
				break;
		}
	}
	fclose (fp);
	delete ( new_list_node );
}

//Fetches page_to_deal into the file location, gathers all links of that file into
//the list headed by head_list[num_of_head_list], then advances num_of_head_list
//to the next slot.
void Download_Colection ( char *location, char *page_to_deal) {
	// Step 1: download the page (the return value is not inspected).
	Download_page ( location, page_to_deal );
	// Step 2: extract the links from the saved file.
	Collect_website ( location );
	// Step 3: move on to the next list head.
	++num_of_head_list;
}
//用于初始化head_list的一条头结点（num_of_head_list为全局变量）
void Creat_new_list ( ) {
		strcpy (head_list[ num_of_head_list].web, head_list[ num_of_head_list - 1].next->web );
		head_list [ num_of_head_list ].next = NULL;
}

//Optional debugging helper: dumps the URLs of the lists head_list[0] through
//head_list[num_of_head_list] to E:\result.txt, one URL per line, with a
//numbered separator line after each list.
//The file is written directly instead of redirecting stdout, so console output
//keeps working afterwards and the stream is closed exactly once.
void Print() {
	FILE *fp = fopen ( "E:\\result.txt", "w" );
	if ( fp == NULL ) {
		cout<<"Open result file failed!"<<endl;
		return ;
	}
	int i=0;
	while (i<=num_of_head_list) {
		for ( struct list_iterm *p = head_list[i].next; p != NULL; p = p->next )
			fprintf ( fp, "%s\n", p->web );
		i ++;
		// The separator carries the 1-based number of the list just printed.
		fprintf ( fp, "%d%s%d\n", i, "*******************************************************************************", i );
	}
	fclose ( fp );
}
/*Computes the mode (most frequent value) of the URL lengths in list head_list[i] and returns it. The measured result was 52; the function below uses that result directly.
int Statics (int i ) { 
	int mode[2][100]={0};//row 0 stores a length; row 1 stores how many times that length occurred
	int num_of_mode=0;
	struct list_iterm *p;
	int len;
	bool inc = false ;
	p = head_list [ i ].next ;
	while ( p != NULL ) {
		len = strlen ( p->web );
		for (int a=0; a < num_of_mode; a ++) {
			if (len == mode[0][a]) {
				mode[1][a] ++ ;
				inc=true;
				p=p->next ;
				a=num_of_mode;
			}
		}
		if( ! inc ) {
			mode[0][num_of_mode]=len;
			mode[1][num_of_mode]=1;
			num_of_mode ++ ;
			p=p->next ;
		}
		inc=false;
	}
	int max=mode[1][0];
	int count=0;
	for (int a = 0; a < num_of_mode ; a ++ ) {
		if (mode[1][a] > max ) {
			max = mode[1][a];
			count = a;
		}
	}
	return mode[0][count];
}*/

//Removes node_to_delete from the list headed by head_list[i] and frees it.
//Nothing happens (and nothing is freed) when node_to_delete is NULL or is not
//a member of that list.
void Delete (struct list_iterm * node_to_delete, int i ) {
	if ( node_to_delete == NULL )
		return ;
	// `link` always addresses the pointer that leads to the node under
	// inspection, so the head and interior cases are handled the same way.
	struct list_iterm **link = &head_list [ i ].next;
	while ( *link != NULL && *link != node_to_delete )
		link = &(*link)->next;
	if ( *link == NULL )
		return ;
	*link = node_to_delete->next ;
	delete node_to_delete;
}


//Removes, from every list head_list[0] through head_list[num_of_head_list], each
//node whose URL length differs from 52 and frees it.
//52 is hard-coded: per the author's note it is the mode of the URL lengths that
//Statics computed, used directly to save time.
void Delete_needless ( ) {
	for ( int i=0; i<=num_of_head_list; i ++ ) {
		// Single pass per list: `link` addresses the pointer leading to the
		// current node, so unlinking needs no second search from the head.
		struct list_iterm **link = &head_list [ i ].next;
		while ( *link != NULL ) {
			struct list_iterm *node = *link;
			if ( strlen ( node->web ) != 52 ) {
				*link = node->next ;
				delete node;
			}
			else
				link = &node->next ;
		}
	}
}
				
void main ( )
{
	FILE * web_list;
	char listname [BUFFSIZE];
	cout<<"please input the list-file name :"<<endl;
	cin>>listname;
	web_list = fopen ( listname, "rb" );
	/*打开输入的链接地址*/
	if(web_list == NULL ) {
		cout<<"open_web_list error!"<<endl;
	}
	/*按顺序读取文件中网址*/
	char page_to_download[1000];
	char save_location [ BUFFSIZE ] ;
	char location_copy [BUFFSIZE];
	while ( fgets ( page_to_download, 999, web_list ) != NULL) {
		strcpy ( head_list[ num_of_head_list ].web, page_to_download );
		head_list[ num_of_head_list ].next = NULL;
		cout<<"please input the location of pages to be downloaded: "<<endl;
		cin>>save_location;
		strcpy ( location_copy, save_location );
		Download_Colection ( save_location, page_to_download );
		Creat_new_list ();
		Download_Colection ( save_location, head_list [ num_of_head_list ].web );
		Creat_new_list ();
		while ( strcmp ( head_list[ num_of_head_list-2].web, head_list[ num_of_head_list-1 ].next->web )) {
			Download_Colection (save_location, head_list [ num_of_head_list ].web );
			Creat_new_list ();
		}	
		num_of_head_list --;
		Delete_needless();
		//Print();	//调试用。
		struct list_iterm *p;
		int page_num=0;
		for ( int i=0; i < num_of_head_list ; i ++ ) {
			p=head_list[ i ].next;
			while ( p != NULL ) {
				strcpy (save_location, location_copy );
				Creat_save_location ( save_location, page_num );
				page_num++;
				Download_page ( save_location, p->web );
				p=p->next ;
			}
		}
		num_of_head_list=0;
	}
}

这个程序里混杂了不少C的函数。原因是自己之前一直用的是C，这次也不过是把C伪装了一下。

先说使用方法

先建立一个文件

比如E:\\list.txt

里面保存着需要下载的那个人的博客的博文目录的第一页

比如我这个文件里放的就是：

http://blog.sina.com.cn/s/articlelist_1197161814_0_1.html
http://blog.sina.com.cn/s/articlelist_1187986757_0_1.html
http://blog.sina.com.cn/s/articlelist_1256947091_0_1.html
http://blog.sina.com.cn/s/articlelist_1240901011_0_1.html
http://blog.sina.com.cn/s/articlelist_1260535950_0_1.html

注意：运行时最好把新浪相关的博客、微博之类的退出来（不是关掉，是退出）。不过貌似只是影响速度。

然后下一个需要输入的是下载的文件的保存地址

如 E:\\result\\a

注意，a后面不能有后缀，同时确保result这个文件夹存在。

如图