"""Scrape the Baidu News front page and append linked articles to a text file.

For every ``<li>`` link found inside the ``div.mod-tab-content`` sections of
https://news.baidu.com, the linked page is downloaded and the text of each
``.article-content`` element is appended to ``新闻.txt`` together with the
link URL and link text. Links whose page has no ``.article-content`` element
(or whose page cannot be downloaded) are recorded with URL and title only.
"""
import sys

import bs4
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}

INDEX_URL = 'https://news.baidu.com'
OUTPUT_PATH = '新闻.txt'
# Seconds to wait for a server before giving up; without a timeout a single
# stalled connection would block the whole run forever.
REQUEST_TIMEOUT = 10


def _format_entry(url, title, body=None):
    """Return the text block written to the output file for one link.

    With ``body=None`` only the ``url-----------title:`` header is produced;
    otherwise the article text follows the header on the next line.
    """
    header = '\n\r' + url + '-----------' + title + ':'
    if body is None:
        return header + '\r\n'
    return header + '\n' + body + '\r\n'


def _fetch_article_bodies(url):
    """Return the text of every ``.article-content`` element on the page at *url*.

    Returns an empty list when the page has no such element or when the
    request fails (network error, timeout, malformed or non-HTTP URL); the
    failure is reported on stderr so one bad link does not abort the crawl.
    """
    try:
        page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('skipping body of ' + url + ': ' + str(exc), file=sys.stderr)
        return []
    article_soup = bs4.BeautifulSoup(page.text, 'html.parser')
    return [node.getText() for node in article_soup.select('.article-content')]


def main():
    """Download the index page and append one entry per article link.

    Raises:
        requests.HTTPError: if the index page responds with an error status.
        requests.RequestException: if the index page cannot be fetched at all.
    """
    index = requests.get(INDEX_URL, headers=headers, timeout=REQUEST_TIMEOUT)
    index.raise_for_status()
    index_soup = bs4.BeautifulSoup(index.text, 'html.parser')
    sections = index_soup.find_all('div', class_='mod-tab-content')

    # Explicit UTF-8 keeps writes from failing on characters the platform's
    # default codec cannot encode; ``with`` guarantees the file is closed
    # (and buffered entries flushed) even if the crawl raises midway.
    with open(OUTPUT_PATH, 'a', encoding='utf-8') as fp:
        for section in sections:
            for entry in section.find_all('li'):
                link = entry.a
                # Some list items carry no anchor, or an anchor without href.
                if link is None or not link.get('href'):
                    continue
                url = link['href']
                title = link.text
                bodies = _fetch_article_bodies(url)
                if bodies:
                    for body in bodies:
                        fp.write(_format_entry(url, title, body))
                else:
                    fp.write(_format_entry(url, title))


if __name__ == '__main__':
    main()
重温了bs4模块内容,tag等内容掌握得还不好,继续练习