# Part 1: 爬取豆瓣读书中作品的信息 (scrape basic book info from the Douban Books tag page)
import fake_useragent
import requests
from lxml import etree
# Scrape title/author/rating/summary/price for every book on the Douban
# Books "小说" (fiction) tag listing page and write one CSV-ish line per book.
#
# Fixes vs. original: the loop was `for url in li_list` while the body
# referenced an undefined name `li` (NameError); the loop body had lost its
# indentation; print() indexed `[0]` on the already-extracted strings and so
# printed only the first character of each field.
url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4"

head = {
    # Random User-Agent so requests look less like a bot.
    "User-Agent": fake_useragent.UserAgent().random
}

response = requests.get(url, headers=head)
# etree.HTML() parses the HTML string into an element tree for XPath queries.
tree = etree.HTML(response.text)
# One <li> per book in the subject list.
li_list = tree.xpath("//ul[@class='subject-list']/li")


def _clean(text):
    """Collapse a raw field: drop newlines and all whitespace runs."""
    return "".join(text.split())


# Context manager guarantees the file is closed even if a request fails.
with open("./doubantushu.txt", "w+", encoding="utf-8") as fp:
    for li in li_list:
        # Book title lives in the `title` attribute of the <a> inside <h2>.
        title = li.xpath(".//div[2]/h2/a/@title")[0]
        writer = li.xpath(".//div[2]/div[1]/text()")[0]
        evaluation = li.xpath(".//div[2]/div[2]/span/text()")[0]
        sketch = li.xpath(".//div[2]/p/text()")[0]
        price = li.xpath(".//div[2]/div[3]/div[2]/span/a/text()")[0]
        record = [_clean(f) for f in (title, writer, evaluation, sketch, price)]
        print(*record)
        fp.write(",".join(record) + "\n")
# Part 2: 爬取豆瓣读书中作品进一步的详细信息 (scrape detailed info from each book's own page)
import fake_useragent
import requests
from lxml import etree
# Follow each book's detail-page link from the tag listing and scrape
# title / author / publisher / year / pages / price / binding / series / ISBN,
# writing one comma-joined line per book.
#
# Fixes vs. original: the href XPath `div[2]//a/@href` matched EVERY anchor
# in the info block (author links, rating links), so non-book pages were
# fetched; the producer fallback `x[0] if x[0] else "0"` indexed [0] before
# checking emptiness and raised IndexError on books without a producer
# (defeating its stated purpose); the labeled fields (出版年/页数/定价/装帧/ISBN)
# also indexed [0] unconditionally; fp.write() had no trailing "\n", so the
# whole output file became a single line; loop indentation was lost.
url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4"

head = {
    # Random User-Agent so requests look less like a bot.
    "User-Agent": fake_useragent.UserAgent().random
}

response = requests.get(url, headers=head)
tree = etree.HTML(response.text)
# Only the <h2> anchor of each <li> points at the book's detail page.
url_list = tree.xpath("//ul[@class='subject-list']/li/div[2]/h2/a/@href")

# Shared prefix of every field query on a detail page.
_INFO = "//div[@class='subject clearfix']/div[2]"


def _first(doc, query, default="0"):
    """Return the first XPath match, stripped, or `default` when absent."""
    hits = doc.xpath(query)
    return hits[0].strip() if hits else default


def _labeled(doc, label):
    """Value of the text node right after a <span class='pl'> label (e.g. '出版年:')."""
    return _first(
        doc,
        _INFO + "/span[@class='pl' and text()='%s']/following-sibling::text()[1]" % label,
    )


# NOTE(review): the extension ".txt2" looks like a typo for "2.txt" in the
# original — kept byte-identical to preserve behavior; confirm with author.
with open("./doubantushu.txt2", "w+", encoding="utf-8") as fp:
    for book_url in url_list:
        resp = requests.get(book_url, headers=head)
        detail = etree.HTML(resp.text)
        book = _first(detail, "//div[@id='wrapper']/h1/span/text()")
        director = _first(detail, _INFO + "/span[1]/a/text()")
        publish = _first(detail, _INFO + "/a[1]/text()")
        # Not every book has a producer (出品方); "0" stands in when missing.
        # NOTE(review): collected but never printed/written in the original —
        # kept as-is; possibly an oversight to confirm with the author.
        producer = _first(detail, _INFO + "/a[2]/text()")
        year = _labeled(detail, "出版年:")
        page = _labeled(detail, "页数:")
        price = _labeled(detail, "定价:")
        pool = _labeled(detail, "装帧:")  # binding / cover type
        isbn = _labeled(detail, "ISBN:")
        series = _first(detail, _INFO + "/a[3]/text()")
        fields = (book, director, publish, year, page, price, pool, series, isbn)
        print(*fields)
        fp.write(",".join(fields) + "\n")
        # break  # uncomment to debug a single book page