4. Once you are clear about the page structure, do not rush into scraping. Go to the top-ranked data-source site, first write all of the relevant titles (or file names) together with their link addresses to a local file, and then use regular expressions to filter the titles (or file names). Example:
# Filter function: lines containing any banned word are filtered out,
# and lines that do not contain the required word are filtered out as well.
# The banned-word list and the required-word list can be defined freely.
import re
from transcoding import strdecode   # strdecode comes from the transcoding module described in item 4 of section II below

def name_filter(rawfile, result_file):
    # Read the file to be filtered into a list
    f = open(rawfile)
    l = f.readlines()
    f.close()
    # List of banned words
    noword = ["banned_word_1", "banned_word_2", "banned_word_3", ...]
    uni_noword = []
    # Build the unicode list of banned words
    for word in noword:
        uni_word = strdecode(word)
        uni_noword.append(uni_word)
    # The word every line must contain
    yword = strdecode("required_word")
    # Build the regular-expression patterns
    nopattern = re.compile('|'.join(uni_noword))
    ypattern = re.compile(yword)
    # Filter the records in the file: lines containing a banned word or
    # missing the required word are collected in dele and then removed from l
    dele = []
    for item in l:
        if re.findall(nopattern, strdecode(item)):
            dele.append(item)
        elif re.findall(ypattern, strdecode(item)) == []:
            dele.append(item)
    # Remove the collected lines from l
    for delitem in dele:
        if delitem in l:
            l.remove(delitem)
    # Write the filtered result to a new file
    fout = open(result_file, 'w')
    for line in l:
        fout.write(line)
    fout.close()
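A minimal call, assuming the crawled titles (or file names) and their links were already written one record per line to a local file; both file names below are made up for illustration:

# Hypothetical file names; raw_titles.txt holds one title-and-link record per line
name_filter('raw_titles.txt', 'filtered_titles.txt')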
II. Details: common problems during scraping and how to solve them
2. The crawler may get detected by the server and the run gets interrupted. Online advice often suggests using proxies or rotating IP addresses; my suggestion is to call sleep to pause appropriately, and you can also combine that with Python's exception handling. Example:
# Example of keeping a crawler from being cut off: combine a moderate pause
# (time.sleep(seconds)) with a loop (while) and exception handling (try...except)
import time

def download(page):
    # Define the download function; fetch and save the given page here
    pass

if __name__ == '__main__':
    # total is the total number of pages to crawl, i is the counter
    total = 100   # placeholder; set to your own page count
    i = 0
    rest = 1
    while i < total:
        try:
            download(page=i)
            i += 1
        except:
            time.sleep(rest)
            if rest == 10:
                continue
            else:
                rest += 1
                continue
3. For downloading files you can use urllib.urlretrieve, which takes only a single statement; the requests module is also worth trying, and so is multithreading, which can be used for range-based (resumable) downloads. Below is first a minimal sketch of the two single-file approaches, and then a Python multithreaded download example reposted from http://www.jb51.net/article/44995.htm.
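The sketch assumes nothing about the target site; the URL and output file names are placeholders made up for illustration:

# Single-file download, two ways; the URL below is only a placeholder
import urllib
import requests

file_url = 'http://example.com/report.pdf'

# Approach 1: one statement with urllib.urlretrieve
urllib.urlretrieve(file_url, 'report.pdf')

# Approach 2: the requests module, writing the response body to disk
r = requests.get(file_url)
with open('report_requests.pdf', 'wb') as f:
    f.write(r.content)

And the reposted multithreaded example: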
# -*- coding: utf-8 -*-
# Author: ToughGuy
# Email: wj0630@gmail.com
# Wrote this thing to get a first look at Python's threading mechanism.
# I normally don't write comments; this time I took the time to comment the code
# in the hope that anyone who spots a problem will point it out, since I may not
# have fully understood it myself.
# Test platform: Ubuntu 13.04 X86_64, Python 2.7.4
import threading
import urllib2
import sys

max_thread = 10
# Initialize the lock
lock = threading.RLock()

class Downloader(threading.Thread):
    def __init__(self, url, start_size, end_size, fobj, buffer):
        self.url = url
        self.buffer = buffer
        self.start_size = start_size
        self.end_size = end_size
        self.fobj = fobj
        threading.Thread.__init__(self)

    def run(self):
        """
        Just a front.
        """
        with lock:
            print 'starting: %s' % self.getName()
        self._download()

    def _download(self):
        """
        This is where the real work gets done.
        """
        req = urllib2.Request(self.url)
        # Add the HTTP Range header to set the byte range this thread downloads
        req.headers['Range'] = 'bytes=%s-%s' % (self.start_size, self.end_size)
        f = urllib2.urlopen(req)
        # Initialize this thread's offset into the shared file object
        offset = self.start_size
        while 1:
            block = f.read(self.buffer)
            # Exit once this thread has fetched all of its data
            if not block:
                with lock:
                    print '%s done.' % self.getName()
                break
            # Writing the data must of course be done under the lock.
            # "with lock" replaces the traditional lock.acquire()...lock.release()
            # and needs Python >= 2.5
            with lock:
                sys.stdout.write('%s saving block...' % self.getName())
                # Move the file object to this thread's offset
                self.fobj.seek(offset)
                # Write the block that was fetched
                self.fobj.write(block)
                offset = offset + len(block)
                sys.stdout.write('done.\n')

def main(url, thread=3, save_file='', buffer=1024):
    # The number of threads must not exceed max_thread
    thread = thread if thread <= max_thread else max_thread
    # Get the size of the file
    req = urllib2.urlopen(url)
    size = int(req.info().getheaders('Content-Length')[0])
    # Initialize the file object
    fobj = open(save_file, 'wb')
    # Work out the HTTP Range each thread is responsible for
    avg_size, pad_size = divmod(size, thread)
    plist = []
    for i in xrange(thread):
        start_size = i * avg_size
        end_size = start_size + avg_size - 1
        if i == thread - 1:
            # The last thread also takes pad_size
            end_size = end_size + pad_size + 1
        t = Downloader(url, start_size, end_size, fobj, buffer)
        plist.append(t)
    # Start the workers
    for t in plist:
        t.start()
    # Wait for all threads to finish
    for t in plist:
        t.join()
    # Remember to close the file object at the end
    fobj.close()
    print 'Download completed!'

if __name__ == '__main__':
    url = 'http://pg.jrj.com.cn/acc/CN_DISC/STOCK_NT/2016/02/17/600306_ls_1201982482.PDF'
    main(url=url, thread=10, save_file='test.pdf', buffer=4096)
Running this code may throw error [10054]: because many threads issue a large number of urlopen calls against the same site, the site may treat it as an attack and sometimes stops allowing downloads, so after urlopen() the request.read() just hangs there and eventually errno [10054] is raised.
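A minimal sketch of my own (not part of the reposted code) of one way to guard a ranged request against that error: catch the reset, sleep, and retry a few times before giving up. The function name and retry numbers are made up.

# Sketch only: retry a ranged download a few times when the connection is reset
import socket
import time
import urllib2

def fetch_range(url, start, end, retries=3, rest=5):
    for attempt in range(retries):
        try:
            req = urllib2.Request(url)
            req.headers['Range'] = 'bytes=%s-%s' % (start, end)
            return urllib2.urlopen(req).read()
        except (socket.error, urllib2.URLError):
            # connection reset or refused by the server (e.g. errno 10054); back off and retry
            time.sleep(rest)
    return None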
On Python's multithreading, I repost the following passage. Try writing an infinite loop in Python:

import threading, multiprocessing

def loop():
    x = 0
    while True:
        x = x ^ 1

for i in range(multiprocessing.cpu_count()):
    t = threading.Thread(target=loop)
    t.start()

The point of the example is that even with one such dead-loop thread per CPU core, overall CPU usage stays at roughly the level of a single core: CPython's GIL lets only one thread execute bytecode at a time, which is why threads help with I/O-bound downloading like the example above but not with CPU-bound work.
4. Garbled Chinese characters. There are many fixes floating around online, such as adding a coding declaration at the top of the file or changing the system default encoding; I tried several and most do not solve the underlying problem. For encodings in Python you mainly need to understand two things: Python's internal encoding and the encoding of the source file (or source strings). Internally Python works in unicode. The rough flow is:
<characters in any encoding (unicode, utf-8, ANSI, gbk, gb2312, etc.)>
    -- input -->
<Python>
    -- internal conversion: whatever encoding came in is converted to unicode -->
    -- output -->
<unicode>
Errors occur at the input or output step: some Python modules cannot accept certain non-unicode encodings, and the file you write to may not accept unicode. So the fix for input errors is to convert the other encodings into unicode, and the fix for output/write errors is to convert unicode into an encoding the target accepts. My solution comes from the jieba source file "\Python2.7.6\Lib\site-packages\jieba\_compat.py", and it works well: rename _compat.py to "transcoding.py", copy it into the \Python2.7.6\Lib directory so it can be used as a standalone module, and import it at the top of your file (from transcoding import strdecode). The _compat.py source is attached below:
# -*- coding: utf-8 -*-
import sys

PY2 = sys.version_info[0] == 2
default_encoding = sys.getfilesystemencoding()

if PY2:
    text_type = unicode
    string_types = (str, unicode)

    iterkeys = lambda d: d.iterkeys()
    itervalues = lambda d: d.itervalues()
    iteritems = lambda d: d.iteritems()
else:
    text_type = str
    string_types = (str,)
    xrange = range

    iterkeys = lambda d: iter(d.keys())
    itervalues = lambda d: iter(d.values())
    iteritems = lambda d: iter(d.items())

def strdecode(sentence):
    if not isinstance(sentence, text_type):
        try:
            sentence = sentence.decode('utf-8')
        except UnicodeDecodeError:
            sentence = sentence.decode('gbk', 'ignore')
    return sentence
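A quick illustration of how strdecode gets used; the byte string below is just an example:

# -*- coding: utf-8 -*-
from transcoding import strdecode

s = '中文标题'            # a utf-8 (or gbk) byte string read from a page or file
u = strdecode(s)          # now a unicode object, safe to use in regex matching
out = u.encode('utf-8')   # re-encode before writing to a utf-8 file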
A recommended site for listed-company filings and reports: cninfo (http://www.cninfo.com.cn/cninfo-new/index), the information-disclosure site designated by the CSRC, with fairly complete and authoritative data. Anything missing there can be looked up on the official sites of the CSRC, the Shanghai Stock Exchange, and the Shenzhen Stock Exchange; failing that, try the various financial and economic databases, the large finance sites, and the finance channels of the big portals.
IV. A small example crawler for product reviews on a shopping site (roughly written; the crawler will get detected, and the server may restrict access from your machine's IP)
# -*- coding: utf-8 -*-
# Purpose: given a search keyword (commodity) and a sort field (psort),
# return information on the top n results.
# Overall ranking: psort=0; price high to low: psort=1 (psort=1 often raises an
# error because such items frequently have no reviews); price low to high: psort=2;
# sales high to low: psort=3; review count high to low: psort=4; new arrivals: psort=5.
# Finally, crawl the reviews of the top n products into local files named
# "Top<n>_<product name>.txt".
# The review data contains: overall statistics, hot review tags, and user reviews.
# Each user review includes:
# anonymous flag, review text, review date, how many days after delivery the review was
# written, whether a mobile client was used, nickname, colour of the product bought,
# size/spec of the product bought, name of the product bought, what is probably the order
# time, score, number of "useful" votes, number of "useless" votes,
# the user's client code, user level ID, user level name, user's province, user's registration time
# Date: 2016-03-19
import requests
import urllib
import re
import sys
import HTMLParser
import time
import random
import threading
from transcoding import strdecode
from bs4 import BeautifulSoup
# Pick a browser User-Agent at random to mimic real Request Headers
user_agent = ['Mozilla/5.0 (Windows NT 6.1)\
AppleWebKit/537.11 (KHTML, like Gecko)\
Chrome/23.0.1271.64 Safari/537.11','Mozilla/5.0 (Windows NT 6.1; WOW64)\
AppleWebKit/537.36 (KHTML, like Gecko)\
Chrome/47.0.2526.106 Safari/537.36','Mozilla/5.0 \
(Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0',"Mozilla/5.0\
(X11; Linux x86_64) AppleWebKit/537.17 (KHTML, like Gecko)\
Chrome/24.0.1312.56 Safari/537.17",'Mozilla/5.0\
(Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0']
# Get information on the top n products
def get_topN(commodity, sort):
    params = urllib.urlencode({'keyword':commodity, 'psort':sort, 'page':'1', 'enc':'utf-8'})
    url = 'http://search.jd.com/Search?%s' % params
    print "\nThe search result page:\n\n" + url
    html = requests.get(url).content
    soup = BeautifulSoup(html, "html.parser")
    resCount = ''
    while not resCount:
        resCount = soup.find_all('span', {'class':'num', 'id':'J_resCount'})[0].string
    print "\nWe found about %s products for you" % resCount
    topN = input("\nPlease Enter The number: ")
    print "\nWe are finding the products for you, please wait...\n"
    a = soup.find_all(target='_blank', title=re.compile('.+'), limit=topN)
    product = []
    for i in range(topN):
        tag = a[i]
        name = tag['title']
        href = 'http:' + tag['href']
        ID = re.findall('/(\d+).html', href)[0]
        text = requests.get(href).content
        commentVersion = re.findall("commentVersion:'(\d+)'", text)[0]
        product.append((name, ID, commentVersion))
        print "Top%s: %s\n%s" % (i+1, name, href)
    flag = raw_input("\nDownload Them? (y/n): ")
    if flag == 'y':
        return product, topN
    else:
        sys.exit(0)
# Get the overall rating statistics and the hot tags
def summary_hot(Id, commentVersion):
    url = ('http://club.jd.com/productpage/p-%s-s-0-t-3-p-0.html'
           '?callback=fetchJSON_comment98vv%s') % (Id, commentVersion)
    text = requests.get(url).content
    d = re.findall('({.+})', text)[0]
    # Replace the JSON literals so the string can be eval'ed as a Python dict
    json = d.replace('null', '"null"').replace('true', '"true"').replace('false', '"false"')
    dic = eval(json)
    # Overall statistics
    CommentSummary = dic['productCommentSummary']
    commentCount = str(CommentSummary['commentCount'])   # total number of reviews
    generalCount = str(CommentSummary['generalCount'])   # neutral reviews
    goodCount = str(CommentSummary['goodCount'])         # positive reviews
    poorCount = str(CommentSummary['poorCount'])         # negative reviews
    # Score distribution
    score1Count = str(CommentSummary['score1Count'])
    score2Count = str(CommentSummary['score2Count'])
    score3Count = str(CommentSummary['score3Count'])
    score4Count = str(CommentSummary['score4Count'])
    score5Count = str(CommentSummary['score5Count'])
    # List of overall statistics
    summary = [commentCount, generalCount, goodCount,
               poorCount, score1Count, score2Count,
               score3Count, score4Count, score5Count]
    # List of hot tags
    hotCommentTag = dic['hotCommentTagStatistics']
    hotTag = []
    if hotCommentTag:
        for item in hotCommentTag:
            count = str(item['count'])   # count
            tag = item['name']           # tag text
            # [[tag, count], ...]
            hotTag.append([tag, count])
    # Work out the number of pages from the total review count
    total = (int(commentCount)/10) + 1
    print "\nDownloading the comment summary successfully!" + '\n' + \
          "\nThere are %s comments,%s pages\n" % (commentCount, total)
    return summary, hotTag, total
# Get all review content
def get_comment(topnum, Id, name, commentVersion, page):
    url = ('http://club.jd.com/productpage/p-%s-s-0-t-3-p-%s.html'
           '?callback=fetchJSON_comment98vv%s') % (Id, page, commentVersion)
    # Build the Request Headers with a randomly chosen browser User-Agent
    agent = random.choice(user_agent)
    header = {'Accept':'*/*',
              'Accept-Encoding':'gzip, deflate, sdch',
              'Accept-Language':'zh-CN,zh;q=0.8',
              'Connection':'keep-alive',
              'Host':'club.jd.com',
              'Referer':'http://item.jd.com/%s.html' % Id,
              'User-Agent':'%s' % agent}
    # Pause moderately so the crawler does not get cut off
    text = ''
    second = 1
    while not text:
        if second == 10: second = 1
        time.sleep(second)
        text = requests.get(url, headers=header).content
        second += 1
    print "Top%s: Downloading page %s" % (topnum, int(page)+1)
    d = re.findall('({.+})', text)[0]
    json = d.replace('null', '"null"').replace('true', '"true"').replace('false', '"false"')
    dic = eval(json)
    # List of reviews
    comments = dic['comments']
    comment_list = []
    if comments:
        for item in comments:
            anonymousFlag = str(item['anonymousFlag'])   # 1 anonymous, 0 real name
            content = item['content']                    # review text
            creationTime = item['creationTime']          # review date
            days = str(item['days'])                     # days after delivery the review was written
            isMobile = item['isMobile']                  # true: mobile client; false: not mobile
            nickname = item['nickname']                  # nickname
            productColor = item['productColor']          # colour of the product bought
            productSize = item['productSize']            # size/spec of the product bought
            #referenceName = item['referenceName']       # name of the product bought
            referenceTime = item['referenceTime']        # probably the order time
            score = str(item['score'])                   # score
            usefulVoteCount = str(item['usefulVoteCount'])    # number of "useful" votes
            uselessVoteCount = str(item['uselessVoteCount'])  # number of "useless" votes
            # The user's client code; different numbers mean different clients such as
            # Android, iPhone, WeChat shopping, etc.
            # iPhone client: 2, Android client: 4, WeChat shopping: 21, not shown on the page: 0.
            userClient = str(item['userClient'])
            userLevelId = item['userLevelId']            # user level ID
            userLevelName = item['userLevelName']        # user level name
            userProvince = item['userProvince']          # user's province
            userRegisterTime = item['userRegisterTime']  # user's registration time
            record = [anonymousFlag, content, creationTime, days, isMobile, nickname,
                      productColor, productSize, referenceTime,
                      score, usefulVoteCount, uselessVoteCount, userClient,
                      userLevelId, userLevelName, userProvince, userRegisterTime]
            comment_list.append(record)
            # The ./data directory must already exist
            with open('./data/Top%s_%s.txt' % (topnum, name), 'a+') as f:
                content = '\t'.join(record) + '\n'
                f.write(content)
# Download the reviews of the product at index j (rank j+1)
def download(j):
    split_line = '\n' + '='*110 + '\n'
    topnum = j + 1
    item = product[j]
    name = re.sub(r'[\\/:*?"<>|]', '_', item[0])
    summary, hotTag, total = summary_hot(item[1], item[2])
    summ = '\t'.join(summary)
    hot = '\t'.join(sum(hotTag, []))
    content = summ + split_line + hot + split_line
    with open('./data/Top%s_%s.txt' % (topnum, name), 'a+') as f:
        f.write(content)
    # Keep the crawler from being blocked, to avoid network error [10006]
    i = 0
    rest = 1
    while i < total:
        try:
            page = str(i)
            get_comment(topnum, item[1], name, item[2], page)
            i += 1
        except:
            time.sleep(rest)
            if rest == 10:
                continue
            else:
                rest += 1
                continue
if __name__ == '__main__':
    keyword = raw_input("Please Enter The Key Word: ")
    sort_info = strdecode("\nOverall--0, price low to high--2, sales high to low--3, review count high to low--4, new arrivals--5.")
    print sort_info
    psort = raw_input("\nPlease Enter The Sort Field: ")
    commodity = strdecode(keyword).encode('utf-8')
    product, topN = get_topN(commodity, psort)
    # Use one thread per product to download the top n products
    threads = []
    for i in range(topN):
        t = threading.Thread(target=download, args=(i,))
        threads.append(t)
    for t in threads:
        t.start()
    # Wait for all threads to finish
    for t in threads:
        t.join()
    print "Download completed!"
Finally, since the crawlers I have written are neither many nor large, the text above inevitably has omissions and mistakes. Comments and corrections are very welcome; feel free to leave questions too, so we can discuss and learn from each other.