Notes on Crawling Douban Books (豆瓣读书)

# -*- coding: utf-8 -*-
import re,os
import urllib2,csv
import time
import numpy as np
from lxml import etree
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )

"""-------------------------------------
clock() -> floating point number
  该函数有两个功能,
  在第一次调用的时候,返回的是程序运行的实际时间;
  以第二次之后的调用,返回的是自第一次调用后,到这次调用的时间间隔
"""#-------------------------------------

begin_time = time.clock()
# Request headers: a browser User-Agent so Douban does not reject the requests
hds={'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}


def open_url(url):
	global hds
	request = urllib2.Request(url,headers=hds)
	html = urllib2.urlopen(request).read().decode('utf-8')
	#print html
	return html


def get_text(url, label):
    # one CSV per tag; rows: book name, author, info block, rating, number of ratings
    csvFile = open(r"/media/alis/个人文件资料/Spider/douban_read/%s/book_info.csv" % label, 'w+')
    write = csv.writer(csvFile)
    try:
        Html = open_url(url)
        web = etree.HTML(Html)
        num = web.xpath('//div[@class="paginator"]//a[last()-1]/text()')
        num = int(num[0]) + 1      # last page number for this tag, +1 so range() covers it
        print num
        for i in range(1, num):
            index = (i - 1) * 20   # each listing page shows 20 books
            Url = url + '?start=%s&type=T' % index
            #print Url
            html2 = open_url(Url)
            web2 = etree.HTML(html2)
            url2 = web2.xpath('//div[@id="subject_list"]//h2//@href')   # links to the book detail pages
            for url_i in url2:
                time.sleep(np.random.rand()*2)   # random pause of 0 to 2 seconds between requests
                print url_i
                html3 = open_url(url_i)
                web3 = etree.HTML(html3)
                book_name = web3.xpath('//div[@id="wrapper"]/h1/span/text()')
                author = web3.xpath('//div[@id="info"]/span[1]//a/text()')
                book_info = web3.xpath('//*[@id="info"]/text()')

                # drop text nodes that consist of whitespace only
                content = []
                for info_single in book_info:
                    match = re.findall(ur'^\s+$', info_single.decode('utf-8'))
                    if len(match) != 0:
                        continue
                    content.append(info_single.encode('utf-8'))
                Content = "".join(content)

                evaluate = web3.xpath('//*[@class="rating_self clearfix"]/strong/text()')
                e_people = web3.xpath('//*[@class="rating_sum"]//a/span/text()')
                #print book_name[0],author[0],Content,evaluate[0],e_people[0]
                try:
                    write.writerow((book_name[0], author[0], Content, evaluate[0], e_people[0]))
                except:
                    # some detail pages use a different layout; skip those books
                    continue
    except Exception, e:
        print Exception, ":", e
    csvFile.close()

       
url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'


def get_all_href(url):
    # collect the href of every tag listed on the "all tags" page
    html = open_url(url)
    Html = etree.HTML(html)
    hrefs = Html.xpath('//div[@class=""]//@href')
    for href in hrefs:
        label = href.split('/')[2]            # e.g. '/tag/小说' -> '小说'
        os.mkdir('%s' % label)                # create one folder per tag
        Href = 'https://book.douban.com' + href
        get_text(Href, label)
        
       
get_all_href(url)        
end_time = time.clock()
print end_time-begin_time


1. From the home page, open the "all tags" page and collect the links of the 145 different book categories. The sketch below shows how each collected href is turned into a folder label and a full tag URL.

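A minimal sketch of that step, assuming the hrefs gathered by get_all_href are relative paths of the form '/tag/<tag name>' (the value below is only an example):

# -*- coding: utf-8 -*-
href = '/tag/小说'                              # example href from the tag page
label = href.split('/')[2]                      # -> '小说', used as the folder name
full_url = 'https://book.douban.com' + href     # absolute URL passed to get_text
print label, full_url
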
2. The maximum number of pages differs from tag to tag, so the crawler first reads the last page number for each tag. Every listing page then follows the pattern Url = url + '?start=%s&type=T' % index, where index = (i-1)*20; the sketch below prints the first few page URLs.

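For illustration, the first few listing-page URLs produced by that formula (the tag URL is a placeholder):

base = 'https://book.douban.com/tag/example'    # placeholder tag URL
for i in range(1, 4):                           # first three listing pages
    index = (i - 1) * 20                        # each page shows 20 books
    print base + '?start=%s&type=T' % index
# page 1 -> start=0, page 2 -> start=20, page 3 -> start=40
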
3. Further digging showed that once the page count goes past 50 the remaining pages are empty, and some book detail pages (the lowest level) use a different layout, so the XPath queries return empty lists. This post simply skips those books by wrapping the CSV write in try/except; a more explicit version of that guard is sketched below.

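The bare try/except above mostly catches the IndexError raised when one of those XPath lists comes back empty. A sketch of a more explicit guard, assuming the same variables as in get_text (first_or_skip is a hypothetical helper, not part of the original script):

def first_or_skip(*fields):
    # return None if any XPath result list is empty, otherwise the first item of each
    if any(len(f) == 0 for f in fields):
        return None
    return tuple(f[0] for f in fields)

row = first_or_skip(book_name, author, evaluate, e_people)
if row is not None:
    write.writerow((row[0], row[1], Content, row[2], row[3]))
# otherwise skip the book: its detail page uses a different layout
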
4. Command for creating one folder per label: os.mkdir('../(path)/%s' % label); a variant that tolerates reruns is sketched below.

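os.mkdir raises OSError if the folder already exists, so rerunning the script crashes on the first tag that was crawled before. A small sketch that guards against this (the base path and label are placeholders):

import os

base = '/media/alis/Spider/douban_read'   # placeholder base path
label = 'fiction'                         # placeholder tag label
target = os.path.join(base, label)
if not os.path.exists(target):            # skip folders left over from an earlier run
    os.makedirs(target)
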
5. Commands for working with the CSV file:

import csv
csvFile = open('book_info.csv', 'w')
write = csv.writer(csvFile)
write.writerow((data))    # one tuple of fields per book

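In Python 2 the csv module writes byte strings, so unicode fields (book titles, authors) have to be encoded before writerow, which is why the script calls .encode('utf-8'). A minimal, self-contained sketch with placeholder values:

# -*- coding: utf-8 -*-
import csv

csvFile = open('book_info.csv', 'w')
write = csv.writer(csvFile)
row = (u'书名', u'作者', u'9.0', u'1000')                # placeholder unicode fields
write.writerow(tuple(f.encode('utf-8') for f in row))   # Python 2 csv expects bytes
csvFile.close()
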
6. The trickiest part: the extracted info text nodes contain varying amounts of whitespace, so a regular expression is applied to the list and only the entries that are not pure whitespace are kept. The code is as follows:

content = []
for info_single in book_info:
    match = re.findall(ur'^\s+$', info_single.decode('utf-8'))
    if len(match) != 0:
        continue
    content.append(info_single.encode('utf-8'))
Content = "".join(content)
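An equivalent, regex-free way to drop the whitespace-only entries is to test str.strip(); this is just an alternative sketch, not what the original code uses:

content = [s.encode('utf-8') for s in book_info if s.strip()]   # keep non-blank text nodes
Content = "".join(content)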

