看到一个抓取新闻的爬虫,原文地址:http://www.lining0806.com/%E7%BD%91%E6%98%93%E6%96%B0%E9%97%BB%E6%8E%92%E8%A1%8C%E6%A6%9C%E6%8A%93%E5%8F%96%E5%9B%9E%E9%A1%BE/
修改了一下,加入了一点改进,重要的是:能在 Python 3 下运行啦~
附上源码:
# -*- coding:utf-8 -*-
"""Scrape the NetEase (163.com) news ranking pages.

Downloads the ranking index page, extracts each channel's name and URL,
then saves every channel's headline list and up to 50 article pages into
a date-stamped folder (``news-YYYYMMDD``).
"""
import os
import re
import time
from urllib import request

from lxml import etree


def StringListSave(save_path, filename, slist):
    """Save (title, url) pairs, tab-separated, one per line, to save_path/filename.txt."""
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    path = save_path + "/" + filename + ".txt"
    with open(path, "w+", encoding='GB18030') as fp:
        for s in slist:
            fp.write("%s\t\t%s\n" % (s[0], s[1]))


def CellPage(save_path, filename, slist):
    """Download and store individual news article pages, one HTML file each.

    At most 50 articles per channel are saved.
    """
    folder = save_path + '/' + filename
    print(folder)
    if not os.path.exists(folder):
        os.mkdir(folder)
    for i, (item, url) in enumerate(slist):
        # Cap how many articles are saved per channel.
        if i >= 50:
            break
        # Strip characters that are invalid in Windows file names.
        newitem = re.sub(r"[\/\\\:\*\?\"\<\>\|]", "", item)
        print(item)
        with open(folder + '/' + newitem + '.html', "w+", encoding='GB18030') as fp:
            # NOTE(review): assumes every article page is GB18030-encoded — confirm.
            page_content = request.urlopen(url).read().decode("GB18030")
            fp.write("%s\n" % page_content)


def Page_Info(myPage):
    """Extract (channel name, channel URL) pairs from the ranking index page via regex."""
    mypage_Info = re.findall(
        r'<div class="titleBar" id=".*?"><h2>(.*?)</h2><div class="more"><a href="(.*?)">.*?</a></div></div>',
        myPage, re.S)
    return mypage_Info


def New_Page_Info(new_page):
    """Extract (headline, article URL) pairs from a channel ranking page via XPath.

    Returns a list so callers may iterate the result more than once.
    """
    dom = etree.HTML(new_page)
    new_items = dom.xpath('//tr/td/a/text()')
    new_urls = dom.xpath('//tr/td/a/@href')
    assert len(new_items) == len(new_urls)
    # BUG FIX: the original returned zip(...) — a single-use iterator in
    # Python 3 — which forced Spider() to call New_Page_Info() twice and
    # re-parse the same page. A list can be consumed repeatedly.
    return list(zip(new_items, new_urls))


def Spider(url):
    """Crawl the ranking index at `url` and save all channels' headline lists and articles."""
    i = 0
    print("downloading ", url)
    myPage = request.urlopen(url).read().decode("GB18030")
    myPageResults = Page_Info(myPage)
    ntime = time.strftime("%Y%m%d", time.localtime(time.time()))
    save_path = "news-" + ntime
    filename = str(i) + "_" + u"Ranking"
    StringListSave(save_path, filename, myPageResults)
    i += 1
    # `channel_url` renamed from `url` to avoid shadowing the parameter.
    for item, channel_url in myPageResults:
        print("downloading ", channel_url)
        new_page = request.urlopen(channel_url).read().decode("GB18030")
        newPageResults = New_Page_Info(new_page)
        filename = str(i) + "_" + item
        StringListSave(save_path, filename, newPageResults)
        # New_Page_Info now returns a list, so the same results are reused
        # here instead of parsing the page a second time.
        CellPage(save_path, filename, newPageResults)
        i += 1


if __name__ == '__main__':
    print("start")
    start_url = "http://news.163.com/rank/"
    Spider(start_url)
    print("end")
然后把它打包成exe文件,这次使用的是Pyinstaller
1. http://www.pyinstaller.org/ 在官网下载Pyinstaller
2. PyInstaller 依赖一些 Windows 组件,需要在 http://sourceforge.net/projects/pywin32/ 下载相应版本,如 pywin32-220.win32-py3.5.exe。如果编译 lxml 时出现“Unable to find vcvarsall.bat”错误,可以直接在 http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml 下载编译好的 lxml 安装包。
3. 命令窗进入pyinstaller目录,运行
python pyinstaller.py --console --onefile hello.py
4. 文件就打包好了
5. 运行 exe 文件的时候,有时会因为环境问题运行不起来,比如报缺少 VC 运行库相关的错误:
只需要安装下VC2015即可。VC下载地址:http://pan.baidu.com/s/1o66GMLk