python2.7爬虫实例-Python 2.7.6 爬虫实例

weixin_37988176

于 2020-11-01 12:48:51 发布

阅读量80

点赞数

# coding: utf-8
"""Crawl the Jianshu recommended-notes list and print each note's title and URL.

Targets Python 2.7 (``urllib2`` / ``urlparse``) with BeautifulSoup 4.

The script is built from four collaborators wired together by ``Splider``:

* ``Manager``  -- tracks which page URLs are pending and which were visited.
* ``Download`` -- fetches a page and returns its raw body.
* ``Parse``    -- extracts (title, url) pairs and the next page URL.
* ``Output``   -- prints (and can optionally de-duplicate) the results.

``print`` is always called with one parenthesised expression, so each call
behaves the same as the Python 2 ``print`` statement it replaces.
"""

import urllib2
from contextlib import closing
from urlparse import urljoin

from bs4 import BeautifulSoup


class Splider(object):
    """Drive the crawl: pull a URL, download it, parse it, print the results."""

    def __init__(self):
        self.manager = Manager()
        self.downloader = Download()
        self.parser = Parse()
        self.outputer = Output()

    def craw_search_word(self, root_url, max_items=10000, page_size=15):
        """Crawl pages starting at ``root_url`` until no pages remain.

        Args:
            root_url: URL of the first list page. It is also the base that
                the parser uses to build follow-up page URLs.
            max_items: Stop once the running item counter reaches this value.
            page_size: Amount the counter advances per page. It is only used
                for the progress message and the ``max_items`` cut-off.
        """
        count = 0
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and count < max_items:
            print("正在加载第" + str(count) + "到" + str(count + page_size) + "条数据")
            current_url = self.manager.get_new_url()
            try:
                html_content = self.downloader.download(current_url)
            except urllib2.URLError as e:
                # ``reason`` may be an exception object rather than a string,
                # so convert it before concatenating.
                if hasattr(e, "reason"):
                    print("craw faild, reason:" + str(e.reason))
                continue

            # ``parse`` yields (None, empty set) for an unusable page. Nothing
            # new is queued in that case and the loop ends on its own.
            new_url, data = self.parser.parse(root_url, html_content)
            self.manager.add_new_url(new_url)
            self.outputer.output(data)
            count += page_size


class Manager(object):
    """Keep the set of URLs still to visit and the set already visited."""

    def __init__(self):
        self.new_urls = set()  # pending URLs
        self.old_urls = set()  # URLs already handed out by get_new_url()

    def add_new_url(self, new_url):
        """Queue ``new_url`` unless it is None, already pending, or visited."""
        if new_url is None:
            return None
        if new_url not in self.new_urls and new_url not in self.old_urls:
            self.new_urls.add(new_url)
        return None

    def has_new_url(self):
        """Return True while at least one URL is pending."""
        return bool(self.new_urls)

    def get_new_url(self):
        """Remove one pending URL, record it as visited, and return it.

        Raises:
            KeyError: if no URL is pending (check ``has_new_url()`` first).
        """
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url


class Download(object):
    """Fetch pages over HTTP with a fixed set of browser-like headers."""

    # NOTE(review): the Cookie value embeds a hard-coded ``_session_id``.
    # A session credential committed to source is a security risk and will
    # expire; consider loading it from configuration instead.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0",
        "Cookie": "Hm_lvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1466075280; __utma=194070582.826403744.1466075281.1466075281.1466075281.1; __utmv=194070582.|2=User%20Type=Visitor=1; signin_redirect=http%3A%2F%2Fwww.jianshu.com%2Fsearch%3Fq%3D%25E7%2594%259F%25E6%25B4%25BB%26page%3D1%26type%3Dnote; _session_id=ajBLb3h5SDArK05NdDY2V0xyUTNpQ1ZCZjNOdEhvNUNicmY0b0NtMnVuUUdkRno2emEyaFNTT3pKWTVkb3ZKT1dvbTU2c3c0VGlGS0wvUExrVW1wbkg1cDZSUTFMVVprbTJ2aXhTcTdHN2lEdnhMRUNkM1FuaW1vdFpNTDFsQXgwQlNjUnVRczhPd2FQM2sveGJCbDVpQUVWN1ZPYW1paUpVakhDbFVPbEVNRWZzUXh5R1d0LzE2RkRnc0lJSHJEOWtnaVM1ZE1yMkt5VC90K2tkeGJQMlVOQnB1Rmx2TFpxamtDQnlSakxrS1lxS0hONXZnZEx0bDR5c2w4Mm5lMitESTBidWE4NTBGNldiZXVQSjhjTGNCeGFOUlpESk9lMlJUTDVibjNBUHdDeVEzMGNaRGlwYkg5bHhNeUxJUVF2N3hYb3p5QzVNTDB4dU4zODljdExnPT0tLU81TTZybUc3MC9BZkltRDBiTEsvU2c9PQ%3D%3D--096a8e4707e00b06b996e8722a58e25aa5117ee9; CNZZDATA1258679142=1544596149-1486533130-https%253A%252F%252Fwww.baidu.com%252F%7C1486561790; _ga=GA1.2.826403744.1466075281; _gat=1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

    def download(self, url):
        """Return the response body for ``url``.

        Returns:
            None when ``url`` is None; an empty string when the request
            fails with ``urllib2.URLError`` (the failure is printed);
            otherwise the raw body returned by ``response.read()``.
        """
        if url is None:
            return None

        content = ""
        try:
            request = urllib2.Request(url, headers=self.HEADERS)
            # closing() releases the connection even if read() raises.
            with closing(urllib2.urlopen(request)) as response:
                content = response.read()
        except urllib2.URLError as e:
            if hasattr(e, "reason") and hasattr(e, "code"):
                print(e.code)
                print(e.reason)
            else:
                print("请求失败")
        return content


class Parse(object):
    """Extract note links and the next-page URL from a list page."""

    def get_new_data(self, root_url, ul):
        """Collect ``(title, absolute_url)`` tuples from the note list.

        Args:
            root_url: Base URL against which each item's href is resolved.
            ul: The ``ul.note-list`` tag containing ``li.have-img`` items.

        Returns:
            A set of ``(title, url)`` tuples. Items that lack the expected
            markup are skipped.
        """
        data = set()
        for li in ul.find_all("li", {"class": "have-img"}):
            cont = li.find("div", {"class": "content"})
            if cont is None:
                continue
            title_link = cont.find("a", {"class": "title"})
            if title_link is None:
                continue
            # The href is read from the first anchor inside the content div.
            href = cont.a.get("href")
            if not href:
                continue
            # urljoin resolves the href the way a browser would. Plain
            # concatenation glued site-absolute hrefs onto the end of the
            # list-page path, producing invalid URLs.
            data.add((title_link.get_text(), urljoin(root_url, href)))
        return data

    def get_new_url(self, root_url, ul):
        """Build the URL of the next page, or return None if there is none.

        The next page is addressed by the list's ``data-category-id`` and a
        ``max_id`` equal to the last item's ``data-recommended-at`` minus 1.
        """
        items = ul.find_all("li", {"class": "have-img"})
        if not items:
            return None

        data_category_id = ul.get("data-category-id")
        recommended_at = items[-1].get("data-recommended-at")
        if data_category_id is None or recommended_at is None:
            return None

        # data-recommended-at of the last article, minus 1
        max_id = int(recommended_at) - 1
        return (root_url + "?data_category_id=" + data_category_id
                + "&max_id=" + str(max_id))

    def parse(self, root_url, content):
        """Parse a downloaded page.

        Args:
            root_url: Base URL used for the next-page and article URLs.
            content: Raw page body as returned by ``Download.download``.

        Returns:
            ``(next_url, data)``, where ``next_url`` may be None and ``data``
            is a set of ``(title, url)`` tuples. An empty body or a page
            without the expected list container yields ``(None, set())``.
        """
        if not content:
            return None, set()

        soup = BeautifulSoup(content, "html.parser", from_encoding="utf-8")
        div = soup.find(id="list-container")
        if div is None:
            return None, set()
        ul = div.find("ul", {"class": "note-list"})
        if ul is None:
            return None, set()

        new_url = self.get_new_url(root_url, ul)
        new_data = self.get_new_data(root_url, ul)
        return new_url, new_data


class Output(object):
    """Print results, and optionally accumulate them without duplicates."""

    def __init__(self):
        self.datas = set()  # every item ever passed to collect()

    def collect(self, data):
        """Add the non-None items of ``data`` to ``self.datas``."""
        if data is None:
            return None
        for item in data:
            if item is None or item in self.datas:
                continue
            self.datas.add(item)
        return None

    def output(self, data):
        """Print one ``title url`` line per ``(title, url)`` item in ``data``."""
        if not data:
            return
        for title, url in data:
            print(title + " " + url)


if __name__ == "__main__":
    root_url = "http://www.jianshu.com/recommendations/notes"
    splider = Splider()
    splider.craw_search_word(root_url)

weixin_37988176

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python2.7爬虫实例-Python 2.7.6 爬虫实例

#coding: utf-8 import urllib2 from bs4 import BeautifulSoup class Splider: def __init__(self): self.manager=Manager() self.downloader=Download() self.parser=Parse() self.outputer=Output() def craw_search_word(self...
复制链接

扫一扫

python2.7爬虫实例-Python 2.7.6 爬虫实例

“相关推荐”对你有帮助么？