接着上一章节:根据新闻的 URL 获取新闻详情。
# 2 - Fetch the details of a single news article.
# Example input: http://news.sina.com.cn/c/nd/2018-01-12/doc-ifyqqciz6031881.shtml
def _first_text(soup, selector):
    """Return the text of the first node matching *selector*, or '' if none."""
    node = soup.select_one(selector)
    return node.text if node is not None else ''


def getNewsDetail(newsURL):
    """Download a news page and extract its main fields.

    The page is fetched with ``requests``, decoded as UTF-8 and parsed with
    BeautifulSoup's ``html.parser``. Each extracted value is printed as it
    is found and then stored in the returned dict.

    Args:
        newsURL: URL of the news article page.

    Returns:
        A dict with the keys ``newsID``, ``newsHref``, ``title``, ``time``,
        ``source``, ``article`` and ``show_author``. A field whose element
        is missing from the page (or an ID that cannot be parsed from the
        URL) is returned as an empty string instead of raising.
    """
    # A timeout keeps one unresponsive server from hanging the whole crawl.
    response = requests.get(newsURL, timeout=10)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    print(newsURL)

    # News ID: the part of the URL between "doc-i" and ".shtml".
    id_match = re.search(r'doc-i(.*?)\.shtml', newsURL)
    newsID = id_match.group(1) if id_match else ''
    print(newsID)

    # Headline.
    title = _first_text(soup, '.main-title')
    print(title)

    # Publication time (named publish_time so the stdlib `time` module is
    # not shadowed; the dict key stays "time").
    publish_time = _first_text(soup, '.date-source span')
    print(publish_time)

    # Source: prefer the link inside ".date-source", fall back to ".source".
    source_node = soup.select_one('.date-source a')
    if source_node is None:
        source_node = soup.select_one('.source')
    if source_node is not None:
        source = source_node.text
        print(source)
    else:
        source = ''
        print('当前未检测到来源', newsURL)

    # Body text: every paragraph under ".article", stripped and concatenated.
    article = ''.join(
        paragraph.text.strip() for paragraph in soup.select('.article p')
    )
    print(article)

    # Editor / author line.
    show_author = _first_text(soup, '.show_author')
    print(show_author)

    return {
        'newsID': newsID,
        'newsHref': newsURL,
        'title': title,
        'time': publish_time,
        'source': source,
        'article': article,
        'show_author': show_author,
    }