import urllib.request
from bs4 import BeautifulSoup
# NOTE: this import raises ImportError if BeautifulSoup (bs4) is not installed.
# Helper function that fetches the article link tags from the page.
def getUrl(url):
    """Fetch *url* and return the article ``<a>`` tags from the news list.

    Sends a desktop-Chrome ``User-Agent`` header so the site serves the
    normal browser page instead of blocking the default urllib agent.

    :param url: page URL to download.
    :return: list of bs4 ``Tag`` objects matched by ``.list04 > li > p > a``.
    """
    # Attach the header to this request only, instead of the original
    # build_opener()/install_opener() pattern which mutated urllib's
    # process-global opener on every call.
    request = urllib.request.Request(
        url,
        headers={
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
        },
    )
    # Close the response deterministically (the original leaked it).
    with urllib.request.urlopen(request) as response:
        html = response.read().decode('utf-8', 'ignore')
    soup = BeautifulSoup(html, 'lxml')
    # CSS selector for the article anchors inside the news list.
    return soup.select('.list04 > li > p > a')
import sys  # NOTE(review): unused in this script; kept to avoid breaking anything else that may rely on it


if __name__ == '__main__':
    # NOTE(review): the original looped `for i in range(1, 1000000)` and built
    # the URL with 'https://finance.sina.com.cn/stock/'.format(i) — but the
    # template contains no '{}' placeholder, so .format(i) was a no-op and the
    # exact same page was re-fetched up to a million times.  Fetch it once.
    # TODO: restore pagination once the real page-numbered URL template is known.
    url = 'https://finance.sina.com.cn/stock/'

    # Anchor tags for every article on the page.
    linklist = getUrl(url)

    # Single pass: clean the title and print it with its href (the original
    # built two parallel lists only to zip them back together immediately).
    for link in linklist:
        title = link.text.strip()
        # The site prefixes "original"/"repost" markers to some titles;
        # strip those marker fragments out.
        title = title.replace("原 \n ", "").strip()
        title = title.replace("转 \n ", "")
        # Key renamed from the original misspelling 'tittle' -> 'title'.
        data = {'title': title, 'link': link.get('href')}
        print(data)