# Scrape the cnblogs Python category listing page and, for each post entry,
# extract: title, post URL, author name, author homepage, summary text, and
# the raw footer metadata (publish date, comment/view counts).
#
# Fixes over the original paste:
#   * removed the duplicated broken fragment and bare prose lines that made
#     the file a SyntaxError;
#   * `headers` was built but never passed to requests.get() — now it is;
#   * 'User-Agent' was mistakenly set to the target URL; replaced with a
#     real browser UA string (NOTE(review): pick any current UA you prefer).
import requests
from bs4 import BeautifulSoup

URL = 'https://www.cnblogs.com/cate/python/'

headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/91.0.4472.124 Safari/537.36'),
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'cache-control': 'max-age=0',
}

# Fetch and parse the listing page (lxml parser, as in the original).
html = requests.get(URL, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')

# Each post entry lives in a <div class="post_item_body">.
items = soup.select('div[class="post_item_body"]')
for item in items:
    # Title link inside the <h3>: text is the title, href is the post URL.
    title_link = item.select('h3 a[class="titlelnk"]')[0]
    title = title_link.get_text()
    href = title_link['href']

    # Author link: text is the author name, href is the author homepage.
    author_link = item.select('div a[class="lightblue"]')[0]
    author = author_link.get_text()
    author_home = author_link['href']

    # Post summary, with surrounding newlines/spaces trimmed.
    infos = item.select('p[class="post_item_summary"]')[0].get_text().strip('\n').strip(' ')

    # Footer text split on spaces; yields pieces like
    # ['\nAuthorName', '\r\n', '', ..., '发布于', '2...'] per the page layout.
    datas = item.select('div[class="post_item_foot"]')[0].get_text()
    datas = datas.split(' ')