初学python爬虫的xpath,今天用它完成对豆瓣top250图书的爬取
话不多说,直接上代码,后面有解释。
import fake_useragent
import requests
from lxml import etree

# Scrape the titles of Douban's Top-250 books and save them to a text file.
if __name__ == '__main__':
    url = 'https://book.douban.com/top250?start={}'
    # Build the UA generator ONCE; the original created a new UserAgent()
    # for every page, which is needlessly expensive.
    ua = fake_useragent.UserAgent()
    # 'with' guarantees the file is closed even on an exception;
    # the original opened the file and never closed it.
    with open('./book-250.txt', 'w', encoding='utf8') as fp:
        # 10 pages, 25 books per page: start = 0, 25, ..., 225.
        for start in range(0, 250, 25):
            res = requests.get(
                url=url.format(start),
                headers={'User-Agent': ua.random},
            )
            tree = etree.HTML(res.text)
            # One <table> element per book on the page.
            tables = tree.xpath('//*[@id="content"]/div/div[1]/div/table')
            for table in tables:
                # First text node of the title link; strip layout whitespace.
                book_name = table.xpath('.//div[@class="pl2"]/a/text()')[0].strip()
                print(book_name)
                # One title per line in the output file.
                fp.write(book_name + '\n')
1、url的拼接,
注意这里的start ,每页25条数据,所以用for循环,将十页都加上。
2、寻得book_name的数据后,用strip()将每条数据的空格消除,
3、写入到txt文件时,+'\n'将数据一行行的写出。
结果为:
对上述进行补充,爬取图书的所有内容:
并且成功处理了各种缺失数据:
import requests
import time
import re
from lxml import etree
import os
import fake_useragent  # BUG FIX: was missing; UserAgent() below needs it

# Scrape details for Douban's Top-250 books: name, cover URL, author,
# publisher, publication date, price, score, rating count, one-line intro.
if __name__ == '__main__':
    url = 'https://book.douban.com/top250?start={}'
    head = {
        'User-Agent': fake_useragent.UserAgent().random
    }
    # 'with' guarantees the file is closed even on an exception.
    with open('./book_details.txt', 'w', encoding='utf8') as f:
        # 10 pages, 25 books per page: start = 0, 25, ..., 225.
        for i in range(0, 250, 25):
            res = requests.get(url=url.format(i), headers=head)
            tree = etree.HTML(res.text)
            # One <table> element per book on the page.
            table_details = tree.xpath('//div[@id="content"]/div/div[1]/div/table')
            for table in table_details:
                book_name = table.xpath('.//div[@class="pl2"]/a/text()')[0].strip()
                print(book_name)
                book_pic_url = table.xpath('.//a[@class="nbg"]/img/@src')[0]
                # NOTE: the original also downloaded the image here, but the
                # code that saved it was commented out, so the extra GET
                # request was pure waste and has been removed.
                print(book_pic_url)
                # Info line looks like "author / ... / publisher / date / price".
                # The author part may itself contain ' / ' (translators etc.),
                # so everything before the last three fields is the author section.
                book_info = table.xpath('.//p[@class="pl"]/text()')[0].split(' / ')
                print(book_info)
                if len(book_info) < 4:
                    # Too few fields to split reliably (the original crashed
                    # with IndexError on short lines, or skipped entries with
                    # no author) — skip, as the original intended.
                    continue
                book_author = book_info[:-3][0]
                book_publish = book_info[-3].strip()
                book_time = book_info[-2].strip()
                book_price = book_info[-1].strip()
                print(book_author)
                print(book_publish)
                print(book_time)
                print(book_price)
                book_score = table.xpath('.//div[@class="star clearfix"]/span[2]/text()')[0].strip()
                book_evaluates = table.xpath('.//div[@class="star clearfix"]/span[3]/text()')[0].strip()
                print(book_score)
                # Strip the surrounding parentheses, e.g. "(123456人评价)".
                book_evaluate = book_evaluates[1:-1].strip()
                print(book_evaluate)
                book_intro = table.xpath('.//p[@class="quote"]/span/text()')
                if not book_intro:
                    # Some entries have no one-line intro; skip them.
                    continue
                book_intro = book_intro[0]
                print(book_intro)
                # BUG FIX: the original wrote book_intro[0] here — i.e. only
                # the FIRST CHARACTER of the intro, because book_intro had
                # already been reassigned from a list to a string. Write the
                # whole intro string instead.
                f.write(
                    book_name + book_author + book_publish + book_time +
                    book_price + book_score + book_evaluate + book_intro + '\n')