小白学习爬虫:爬取新浪财经新闻
放码过来
import requests
from bs4 import BeautifulSoup
# Crawl the Sina Finance front page: print every headline link found in the
# main news list, download each linked article, and append the text of its
# paragraphs to caijing.txt (each article followed by a newline).

# Seconds to wait for a server response. Without an explicit timeout,
# requests.get() may block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

url = 'https://finance.sina.com.cn/'
html = requests.get(url, timeout=REQUEST_TIMEOUT)
# Fail loudly if the front page itself cannot be fetched; there is nothing
# useful to parse in an error response.
html.raise_for_status()
# Decode the body as UTF-8 regardless of what the response headers declare.
html.encoding = 'utf8'
soup = BeautifulSoup(html.text, 'lxml')
lis = soup.select('.m-p1-m-blk2 .m-p1-mb2-list.m-list-container ul li a ')

for li in lis:
    title = li.text
    # Use .get() so an anchor without an href is skipped by the filter below
    # instead of raising KeyError.
    innerUrl = li.get('href', '')
    # Only follow links ending in "shtml" whose title is longer than three
    # characters; everything else is ignored.
    if not (innerUrl.endswith('shtml') and len(title) > 3):
        continue
    print(title, innerUrl)

    # A single unreachable article should not abort the whole crawl.
    try:
        article_html = requests.get(innerUrl, timeout=REQUEST_TIMEOUT)
        article_html.raise_for_status()
    except requests.RequestException as exc:
        print('请求失败,跳过', innerUrl, exc)
        continue
    article_html.encoding = 'utf8'

    # Separate names keep the front-page `html` / `soup` objects intact.
    article_soup = BeautifulSoup(article_html.text, 'lxml')
    result = article_soup.select('.article p')
    res = ''.join(r.text for r in result)
    print('新闻内容', res)

    # Re-open in append mode per article so each one is flushed and closed
    # on disk before the next request is made.
    with open('caijing.txt', 'a', encoding='utf8') as f:
        f.write(res + '\n')