博主这一星期基本都在玩 Python 爬虫,从豆瓣的图书排行、豆瓣的电影排行,到链家的房屋信息、去哪儿网的旅游信息,爬了个遍。先贴一段爬取豆瓣图书 Top 250 排行的代码给小伙伴们:
#-*- coding:utf-8 -*-
import requests
from lxml import etree
import time
# Scraper for the Douban book ranking: requests each listing page, extracts
# one record per book and saves it as a "-->"-separated line of text.
OUTPUT_PATH = r'D:/douban.txt'
PAGE_URL = 'https://book.douban.com/top250?start={}'
PAGE_COUNT = 10        # number of listing pages requested
PAGE_SIZE = 25         # step of the ``start`` query parameter between pages
PAUSE_SECONDS = 1      # pause used to avoid getting the IP banned
REQUEST_TIMEOUT = 10   # seconds; keeps a stalled connection from hanging the run


def clean_count(raw):
    """Return the rating-count text without its surrounding parentheses.

    Whitespace is stripped both outside and inside the parentheses, so
    padding or line breaks next to the closing parenthesis do not leak into
    the single-line output record.
    """
    return raw.strip().strip('()').strip()


def format_record(title, score, comment, num, href):
    """Build the ``-->``-separated line (without newline) for one book.

    ``comment`` is the list of quote strings found for the book. Only its
    first entry is used; when the list is empty the quote field is omitted,
    giving four fields instead of five.
    """
    if comment:
        return '{}-->{}-->{}-->{}-->{}'.format(title, score, comment[0], num, href)
    return '{}-->{}-->{}-->{}'.format(title, score, num, href)


def main():
    """Scrape every listing page, printing and saving one line per book.

    Raises IndexError if a book table lacks a title, score, rating count or
    link, and whatever ``requests.get`` raises on network errors or timeout.
    """
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        for page in range(PAGE_COUNT):
            url = PAGE_URL.format(page * PAGE_SIZE)
            html = requests.get(url, timeout=REQUEST_TIMEOUT).text
            tree = etree.HTML(html)
            # Each matched <table> holds one book entry.
            books = tree.xpath('//*[@id="content"]/div/div[1]/div/table')
            for book in books:
                title = book.xpath('./tr/td[2]/div[1]/a/@title')[0]
                score = book.xpath('./tr/td[2]/div[2]/span[2]/text()')[0]
                comment = book.xpath('./tr/td[2]/p[2]/span/text()')
                num = clean_count(book.xpath('./tr/td[2]/div[2]/span[3]/text()')[0])
                href = book.xpath('./tr/td[2]/div[1]/a/@href')[0]
                # Sleep to avoid getting the IP banned.
                # NOTE(review): this pause runs once per book, not once per
                # HTTP request; confirm that this is the intended pacing.
                time.sleep(PAUSE_SECONDS)
                line = format_record(title, score, comment, num, href)
                print(line)
                out.write(line + '\n')
                # Flush per record so partial results are on disk if the run
                # is interrupted.
                out.flush()


if __name__ == '__main__':
    main()