# 爬虫小练习 (web-scraping practice)
import re

import requests as req
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the request instead of the requests default.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"}
# Listing page that is fetched and parsed.
url = "https://book.douban.com/tag/小说"
# Compiled once at import time instead of on every loop iteration; the raw
# string avoids the invalid "\d" escape of a plain string literal.
_DIGITS = re.compile(r"\d+")


def extract_rating_count(text):
    """Return the first run of digits found in *text*.

    Args:
        text: The text to search (here: the content of the ``.pl`` element).

    Returns:
        The first digit sequence as a string, or ``""`` when *text* contains
        no digits (the original ``re.findall(...)[0]`` raised ``IndexError``
        in that case).
    """
    match = _DIGITS.search(text)
    return match.group() if match else ""


def _text_of(node, strip=False):
    """Return ``node.text`` (optionally stripped), or ``""`` if *node* is None.

    ``select_one`` and tag-attribute access return ``None`` when nothing
    matches, so this keeps one incomplete entry from aborting the whole run.
    """
    if node is None:
        return ""
    return node.text.strip() if strip else node.text


def print_book(it):
    """Print the fields scraped from one ``.subject-item`` element.

    Printed, one per line: the ``src`` of the first ``<img>``, the ``title``
    and ``href`` of the ``<h2>`` link, the stripped ``.pub`` text, the
    ``.rating_nums`` text, the first number inside the ``.pl`` text, and the
    text of the first ``<p>``. Missing optional nodes print as empty lines.

    Args:
        it: A BeautifulSoup tag for one ``.subject-item`` entry.
    """
    img = it.select_one("img")['src']
    print("图片:", img)
    a = it.h2.a
    print(a['title'], a['href'])
    print(_text_of(it.select_one(".pub"), strip=True))
    print(_text_of(it.select_one('.rating_nums')))
    print(extract_rating_count(_text_of(it.select_one('.pl'), strip=True)))
    print(_text_of(it.p))


def main():
    """Fetch the listing page, dump the parsed document, then print each item.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = req.get(url, headers=header, timeout=10)
    # Fail loudly on an error status instead of silently parsing an error page.
    resp.raise_for_status()
    # resp.text decodes the body with the encoding requests infers for the response.
    content = resp.text
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and no warning is emitted).
    bs = BeautifulSoup(content, "html.parser")
    print(bs)
    for it in bs.select(".subject-item"):
        print_book(it)


if __name__ == "__main__":
    main()
# 爬取结果 (scraping results)
