# -*- coding: utf-8 -*-
import re
import os
import csv
import time
import urllib2
import numpy as np
from lxml import etree
import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 hack: avoid UnicodeDecodeError when str and unicode mix
"""-------------------------------------
clock() -> floating point number
The function behaves in two ways:
the first call returns the actual running time of the program so far;
every later call returns the interval elapsed since the first call.
"""#-------------------------------------
begin_time = time.clock()
# A User-Agent header so the requests look like a normal browser
hds = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
def open_url(url):
    global hds
    request = urllib2.Request(url, headers=hds)
    html = urllib2.urlopen(request).read().decode('utf-8')
    return html
def get_text(url, label):
    csvFile = open(r"/media/alis/个人文件资料/Spider/douban_read/%s/book_info.csv" % label, 'w+')
    write = csv.writer(csvFile)
    try:
        Html = open_url(url)
        web = etree.HTML(Html)
        # The second-to-last pagination link holds the highest page number of this tag
        num = web.xpath('//div[@class="paginator"]//a[last()-1]/text()')
        num = int(num[0]) + 1  # plus 1 so range(1, num) covers every page
        print num
        for i in range(1, num):
            index = (i - 1) * 20  # each listing page shows 20 books
            Url = url + '?start=%s&type=T' % index
            html2 = open_url(Url)
            web2 = etree.HTML(html2)
            # Collect the detail-page links of every book on this listing page
            url2 = web2.xpath('//div[@id="subject_list"]//h2//@href')
            for url_i in url2:
                time.sleep(np.random.rand() * 2)  # random pause, to be gentle on the server
                print url_i
                html3 = open_url(url_i)
                web3 = etree.HTML(html3)
                book_name = web3.xpath('//div[@id="wrapper"]/h1/span/text()')
                author = web3.xpath('//div[@id="info"]/span[1]//a/text()')
                book_info = web3.xpath('//*[@id="info"]/text()')
                # Drop the text nodes that are pure whitespace (see note 6 below)
                content = []
                for info_single in book_info:
                    match = re.findall(ur'^\s+$', info_single.decode('utf-8'))
                    if len(match) != 0:
                        continue
                    content.append(info_single.encode('utf-8'))
                Content = "".join(content)
                evaluate = web3.xpath('//*[@class="rating_self clearfix"]/strong/text()')
                e_people = web3.xpath('//*[@class="rating_sum"]//a/span/text()')
                try:
                    write.writerow((book_name[0], author[0], Content, evaluate[0], e_people[0]))
                except:
                    # Some detail pages have a different layout and the lists are empty (note 3); skip them
                    continue
    except Exception, e:
        print Exception, ":", e
    csvFile.close()
url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
def get_all_href(url):
    html = open_url(url)
    Html = etree.HTML(html)
    hrefs = Html.xpath('//div[@class=""]//@href')  # every tag link on the index page
    for href in hrefs:
        label = href.split('/')[2]  # e.g. /tag/小说 -> 小说
        os.mkdir('%s' % label)  # create one folder per tag
        Href = 'https://book.douban.com' + href
        get_text(Href, label)
get_all_href(url)
end_time = time.clock()
print end_time - begin_time
2. The maximum number of pages is different for every tag, so the spider first scrapes each tag's maximum page count; the listing pages then follow the pattern Url = url + '?start=%s&type=T' % index, where index = (i-1)*20, as the sketch below shows.
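A minimal sketch of that pagination rule, using the 小说 (fiction) tag purely as an example:

# Hypothetical example: generate the first three listing pages of one tag
base = 'https://book.douban.com/tag/小说'
for i in range(1, 4):
    index = (i - 1) * 20  # Douban lists 20 books per page
    page_url = base + '?start=%s&type=T' % index
    print page_url  # ...?start=0&type=T, ...?start=20&type=T, ...?start=40&type=T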
3. Further digging showed that once the maximum page count exceeds 50, every later page is empty, and some book detail pages (the deepest level) have a different structure, so the XPath matches come back empty; this post simply skips those books by wrapping the save in try/except. A sketch of both safeguards follows.
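Both safeguards can also be made explicit; cap_pages and safe_row are hypothetical helpers introduced here for illustration, with names borrowed from get_text above, and the emptiness check is an alternative to the bare try/except:

def cap_pages(num):
    # Pages past 50 come back empty, so never request more than 50 of them
    return min(num, 51)

def safe_row(book_name, author, Content, evaluate, e_people):
    # Return the CSV row, or None when any XPath match list is empty
    if not (book_name and author and evaluate and e_people):
        return None
    return (book_name[0], author[0], Content, evaluate[0], e_people[0])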
4. The command for creating one folder per label: os.mkdir('../(path)/%s' % label); a hedge for re-runs is sketched below.
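os.mkdir raises OSError when the directory already exists, so re-running the spider crashes; a small sketch (ensure_dir is a name introduced here for illustration):

import os

def ensure_dir(label):
    # Create the tag folder only when it is missing,
    # so a second run does not die with "File exists"
    if not os.path.exists(label):
        os.mkdir(label)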
5. The commands for operating on a csv file:
import csv
csvFile = open('book_info.csv', 'w')
write = csv.writer(csvFile)
write.writerow((data))  # data: the tuple of fields to save
csvFile.close()
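One caveat: the Python 2 csv docs say the file should be opened with the 'b' flag on platforms where it matters, otherwise blank rows can appear on Windows. A complete minimal example (the header fields are just illustrative):

import csv

csvFile = open('book_info.csv', 'wb')  # binary mode, as the Python 2 csv docs advise
write = csv.writer(csvFile)
write.writerow(('book_name', 'author', 'info', 'rating', 'rating_people'))  # illustrative header row
csvFile.close()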
6. The trickiest part: the extracted detail fields carry varying amounts of whitespace, so a regular expression is run over the list to keep only the entries that are not pure whitespace. The code is as follows:
content = []
for info_single in book_info:
    # Keep only text nodes that are not pure whitespace
    match = re.findall(ur'^\s+$', info_single.decode('utf-8'))
    if len(match) != 0:
        continue
    content.append(info_single.encode('utf-8'))
Content = "".join(content)
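The same filter can be written without a regex; a sketch assuming book_info holds the unicode strings that lxml returns:

# An entry is blank when stripping its whitespace leaves nothing
content = [info.encode('utf-8') for info in book_info if info.strip()]
Content = "".join(content)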