Crawler code — scraping the first 10 articles from the Jianshu homepage
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request

# Configure a User-Agent header and build the Request object
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'}
req = Request(url='https://www.jianshu.com/', headers=headers)

# Fetch and parse the page
bs = BeautifulSoup(urlopen(req).read(), 'html.parser')
all_article = bs.find_all('a', {'class': 'title', 'target': '_blank'})

# Display the data
for article in all_article:
    print('=====Article=====')
    print('Title: {}'.format(article.get_text()))
    print('Link: {}'.format('https://www.jianshu.com' + article.attrs['href']))
    print(article.parent.p.get_text()[7:])  # each description string starts with 7 whitespace characters, so trim them
    print('============\n')
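The loop above prints every article link found on the page, while the title promises only the first ten. A minimal sketch of how to enforce that limit is shown below: it assumes the same all_article list produced by the code above, slices it to the first 10 matches, and uses strip() on the description instead of the hard-coded 7-character offset, which is slightly more robust if the page layout changes.

# Minimal sketch: limit output to the first 10 articles (assumes all_article from above)
for article in all_article[:10]:
    print('=====Article=====')
    print('Title: {}'.format(article.get_text()))
    print('Link: {}'.format('https://www.jianshu.com' + article.attrs['href']))
    print(article.parent.p.get_text().strip())  # strip() removes the leading whitespace
    print('============\n')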