主要通过 requests + bs4 库抓取人民日报每日新闻标题
1. 先定义获取网页内容的通用函数
def fetchUrl(url):
    """Download *url* and return its body as decoded text.

    Parameters:
        url: absolute URL of the page to fetch.

    Returns:
        The response body as a str, decoded with the encoding detected
        from the content (the target pages are Chinese HTML whose
        charset header is often missing or wrong).

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    # BUG FIX: requests.get()'s second positional parameter is `params`
    # (query string), not `headers` — the original call silently appended
    # the header dict to the URL and sent no browser headers at all.
    r = requests.get(url, headers=headers)
    # Fail loudly on HTTP errors instead of handing an error page downstream.
    r.raise_for_status()
    # apparent_encoding sniffs the real charset from the bytes, which is
    # more reliable here than the (frequently absent) Content-Type header.
    r.encoding = r.apparent_encoding
    return r.text
2. 调用通用函数，获取人民日报当前日期各个版面的链接列表
def get_homepage(net_url):
    """Return the relative links of every section (版面) of today's paper.

    Parameters:
        net_url: URL of the paper's front page
            (the ``nbs.D110000renmrb_01.htm`` page for a given date).

    Returns:
        List of href strings, one per section entry found in the
        front page's swiper carousel.
    """
    # NOTE(review): the original docstring claimed the parameters were
    # year/month/day — the function actually takes the front-page URL.
    html = fetchUrl(net_url)
    bsobj = bs4.BeautifulSoup(html, 'html.parser')
    # Section links live inside <div class="swiper-container">, one
    # <div class="swiper-slide"> per section, each wrapping an <a href>.
    container = bsobj.find('div', attrs={'class': 'swiper-container'})
    slides = container.find_all('div', attrs={'class': 'swiper-slide'})
    return [slide.a['href'] for slide in slides]
3.根据版面链接获取当前版面内的文章链接列表
def get_pageurl(page_url, base_url):
    """Return the article URLs listed on one section (版面) page.

    Parameters:
        page_url: URL of the section page to scan.
        base_url: site root used to build absolute article URLs.

    Returns:
        List of absolute article URLs whose href contains the article
        marker ``nw.D110000renmrb``.

    NOTE(review): this function reads the module-level globals ``year``,
    ``month`` and ``day`` (zero-padded strings set by the ``__main__``
    block). It will raise NameError if called before they are assigned —
    consider passing the date explicitly in a future refactor.
    """
    html = fetchUrl(page_url)
    bsobj = bs4.BeautifulSoup(html, 'html.parser')
    # Article entries are <li> items under <div class="news"><ul>.
    items = bsobj.find('div', attrs={'class': 'news'}).ul.find_all('li')
    newsurl_list = []
    for item in items:
        for anchor in item.find_all('a'):
            link = anchor["href"]
            # Only hrefs containing the article marker are real articles;
            # other anchors (navigation, images) are skipped.
            if 'nw.D110000renmrb' in link:
                newsurl_list.append(
                    '{}/{}-{}/{}/{}'.format(base_url, year, month, day, link)
                )
    return newsurl_list
4.解析当前文章内容
def getContent(page_text):
    """Parse an article page and return its title and body text.

    Parameters:
        page_text: the article page's HTML as a str.

    Returns:
        Tuple ``(title, content)``: *title* is the concatenation of the
        page's h3, h1 and h2 texts (pretitle, headline, subtitle);
        *content* is all paragraph texts from the ``ozoom`` div, each
        followed by a newline.
    """
    soup = bs4.BeautifulSoup(page_text, 'html.parser')
    # Headline pieces: h3 (引题) + h1 (主标题) + h2 (副标题), in that order.
    title = soup.h3.text + soup.h1.text + soup.h2.text
    # Article body paragraphs all sit inside <div id="ozoom">.
    paragraphs = soup.find('div', attrs={'id': 'ozoom'}).find_all('p')
    content = ''.join(p.text + '\n' for p in paragraphs)
    return title, content
5. 最后通过主函数调用相关方法
if __name__ == '__main__':
    # Script entry point: crawl today's paper and print every article title.
    #
    # BUG FIX: the file as given never imports its dependencies anywhere,
    # so running it raised NameError immediately. Importing here (at module
    # scope, inside the script guard) makes the names available to all the
    # functions above when the file is run as a script.
    import datetime

    import bs4
    import requests

    # Placeholder — the real site root is distributed separately by the
    # author (see the note at the bottom of the file).
    base_url = 'h'

    today = datetime.date.today()
    # The site's URL scheme uses zero-padded month/day: YYYY-MM/DD/...
    year = str(today.year)
    month = '{:02d}'.format(today.month)
    day = '{:02d}'.format(today.day)

    # Front page of today's edition; section 01 lists all other sections.
    homepage_url = '{}/{}-{}/{}/nbs.D110000renmrb_01.htm'.format(
        base_url, year, month, day)
    pageList = get_homepage(homepage_url)

    # Section hrefs from the front page are relative; prefix the date path.
    title_url = '{}/{}-{}/{}/'.format(base_url, year, month, day)
    for page in pageList:
        pageurl = title_url + page
        newsurl_list = get_pageurl(pageurl, base_url)
        for newsurl in newsurl_list:
            # Fetch each article and print only its title.
            html = fetchUrl(newsurl)
            title, content = getContent(html)
            print(title)
base_url 地址请私信或回复获取。