Approach for crawling Douban Books:
- Start from the tag index page and extract all the tag URLs (relative links, resolved with urljoin as illustrated after this list)
- Visit each tag page and extract all the list-page URLs
- Visit each list page and extract that page's detail URLs plus the URL of the next list page
- Visit each detail page and grab the book title
- Repeat this cycle until all the data has been collected
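Each step above works the same way: pull href attributes out of the page with an XPath rule, resolve them against the site root, and queue the result. The resolving is done with urljoin; a quick illustration of its behavior (the example paths are placeholders, not necessarily real Douban paths):

    from urlparse import urljoin   # on Python 3 this lives in urllib.parse

    base_url = 'https://book.douban.com'

    # a relative href is resolved against the base URL
    print urljoin(base_url, '/tag/xiaoshuo')
    # -> https://book.douban.com/tag/xiaoshuo

    # an href that is already absolute is returned unchanged
    print urljoin(base_url, 'https://book.douban.com/subject/1084336/')
    # -> https://book.douban.com/subject/1084336/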
# coding: utf8
from gevent import monkey
monkey.patch_all()

# random.choice(list) picks one element at random; used here to rotate the User-Agent
import random
# urljoin(base_url, result) joins a relative path onto the base URL to build a full URL
from urlparse import urljoin

import requests
# lxml parses the fetched HTML so the interesting parts can be pulled out with XPath
from lxml import etree
# page downloads block on network I/O, so a coroutine pool keeps many requests in flight
from gevent.pool import Pool
# queues carry URLs between the producer and consumer greenlets
from gevent.queue import Queue

# base_url is joined (via urljoin) with each relative path extracted from the pages
base_url = 'https://book.douban.com'
# seed URL
start_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'

# XPath extraction rules
rules = {
    # extract the tag (list page) URLs from the tag index
    'list_urls': "//table[@class='tagCol']/tbody/tr/td/a/@href",
    # extract the detail page URLs from a list page
    'detail_urls': "//li[@class='subject-item']/div[@class='info']/h2/a/@href",
    # extract the pagination links from a list page
    'page_urls': "//div[@id='subject_list']/div[@class='paginator']/a/@href",
    # extract the book title from a detail page
    'title': "//div[@id='wrapper']/h1/span/text()",
}

# queues
# channel for list page URLs
list_queue = Queue()
# channel for detail page URLs
detail_queue = Queue()

# coroutine pool
pool = Pool(size=10)

# User-Agent pool
user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
]


def fetch(url):
    """Issue an HTTP request and return the page HTML."""
    user_agent = random.choice(user_agent_list)
    headers = {'User-Agent': user_agent}
    html = requests.get(url, headers=headers).text
    return html


def parse(html, rule):
    """Parse the page with the given XPath rule."""
    return etree.HTML(html).xpath(rule)


def crawl(url):
    """Crawl the start page (the tag index)."""
    html = fetch(url)
    list_urls = parse(html, rules['list_urls'])
    for list_url in list_urls:
        # join the extracted relative URL onto base_url and push it into the queue
        list_queue.put(urljoin(base_url, list_url))


def list_loop():
    """Dispatch list page crawls."""
    while True:
        # block until a list URL arrives on the queue
        list_url = list_queue.get()
        # spawn a greenlet: the first argument is the function, the second its argument,
        # so this keeps calling crawl_list_page(list_url)
        pool.spawn(crawl_list_page, list_url)


def detail_loop():
    """Dispatch detail page crawls."""
    while True:
        detail_url = detail_queue.get()
        pool.spawn(crawl_detail_page, detail_url)


def crawl_list_page(list_url):
    """Crawl a list page."""
    html = fetch(list_url)
    detail_urls = parse(html, rules['detail_urls'])
    # detail pages
    for detail_url in detail_urls:
        detail_queue.put(urljoin(base_url, detail_url))
    # next pages
    list_urls = parse(html, rules['page_urls'])
    for list_url in list_urls:
        list_queue.put(urljoin(base_url, list_url))


def crawl_detail_page(detail_url):
    """Crawl a detail page."""
    html = fetch(detail_url)
    title = parse(html, rules['title'])[0]
    print title


def run():
    """Entry point."""
    # 1. start page
    crawl(start_url)
    # 2. list pages
    pool.spawn(list_loop)
    # 3. detail pages
    pool.spawn(detail_loop)
    # start crawling and wait
    pool.join()


if __name__ == '__main__':
    run()
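One caveat about the fetch helper: it sets no timeout and will happily hand an error page to the parser. A slightly more defensive variant, offered only as a sketch (it reuses the user_agent_list defined above and leaves the rest of the script unchanged):

    import random
    import requests

    def fetch(url, timeout=10):
        """Fetch a page with a random User-Agent; fail fast on slow or bad responses."""
        headers = {'User-Agent': random.choice(user_agent_list)}
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()   # raise instead of parsing a 4xx/5xx error page
        return resp.text

Because monkey.patch_all() is in effect, the timeout still applies per request under gevent, so a stuck greenlet gives up after ten seconds instead of blocking indefinitely.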
Run output:
C:\Python27\python.exe "C:/study wang/spider/spiderdome.py"
围城
活着
平凡的世界(全三部)
沉默的大多数
文学回忆录(全2册)
送你一颗子弹
白鹿原
繁花
北鸢
黄金时代