作业的要求来自于:https://edu.cnblogs.com/campus/gzcc/GZCC-16SE2/homework/2894
给定一篇新闻的链接newsUrl,获取该新闻的全部信息
标题、作者、发布单位、审核、来源
发布时间:转换成datetime类型
点击次数(按以下步骤获取):
- newsUrl
- newsId(使用正则表达式re)
- clickUrl(str.format(newsId))
- requests.get(clickUrl)
- newClick(用字符串处理,或正则表达式)
- int()
整个过程包装成一个简单清晰的函数。
# coding: utf-8
import re
from datetime import datetime

import requests
from bs4 import BeautifulSoup


class News(object):
    """A single GZCC campus news article, scraped from its page URL.

    Constructing an instance downloads the page at ``url`` and parses it.
    The whitespace-separated tokens of the page's ``.show-info`` element are
    cached and sliced by the properties below; each token presumably starts
    with a short label that is stripped by a fixed-width slice (verify
    against the live page layout if the site changes).
    """

    # Seconds to wait for the news site before giving up; without a timeout
    # ``requests`` may block forever on an unresponsive server.
    REQUEST_TIMEOUT = 10

    # Layout of the timestamp assembled in ``__init__``.
    _TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(self, url):
        """Fetch and parse the news page.

        :param url: address of the news page.
        :raises IndexError: if the page has no ``.show-info`` element or it
            contains fewer than two whitespace-separated tokens.
        """
        self.url = url  # news page address
        self._dom_tree = self._tranfrom_dom_tree(url)
        self._show_infos = self._dom_tree.select(".show-info")[0].text.split()
        # Last update time: token 0 minus its 5-character prefix (the date)
        # joined with token 1 (the time of day).
        self._update_time = self._show_infos[0][5:] + " " + self._show_infos[1]

    def _tranfrom_dom_tree(self, url):
        """Download ``url`` and convert the HTML text into a DOM tree.

        :param url: address to fetch.
        :return: a ``BeautifulSoup`` tree built with ``html.parser``.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Force UTF-8 so the Chinese text is decoded correctly.
        response.encoding = "utf-8"
        return BeautifulSoup(response.text, "html.parser")

    @property
    def title(self):
        """News title: text of the first ``.show-title`` element."""
        return self._dom_tree.select(".show-title")[0].text

    @property
    def auothor(self):
        """News author (token 2 minus its 3-character prefix).

        The misspelled name is kept for backward compatibility; prefer
        :attr:`author`.
        """
        return self._show_infos[2][3:]

    @property
    def author(self):
        """News author; correctly spelled alias of :attr:`auothor`."""
        return self.auothor

    @property
    def auditor(self):
        """News auditor (token 3 minus its 3-character prefix)."""
        return self._show_infos[3][3:]

    @property
    def origin(self):
        """Publishing unit (token 4 minus its 3-character prefix)."""
        # The original omitted ``return`` here and always yielded None.
        return self._show_infos[4][3:]

    @property
    def update_time(self):
        """Last update time as a ``'date time'`` string."""
        return self._update_time

    @update_time.setter
    def update_time(self, time):
        self._update_time = time

    @property
    def update_datetime(self):
        """Last update time parsed into a ``datetime`` object.

        :raises ValueError: if the stored string is not in
            ``%Y-%m-%d %H:%M:%S`` form.
        """
        return datetime.strptime(self._update_time, self._TIME_FORMAT)

    @property
    def times(self):
        """Click count of the article, as an ``int``.

        Queries the site's counter API with :attr:`news_id` and takes the
        last run of digits in the response body.

        :raises IndexError: if the response contains no digits.
        """
        click_url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(self.news_id)
        res = requests.get(click_url, timeout=self.REQUEST_TIMEOUT)
        return int(re.findall(r'(\d+)', res.text)[-1])

    @property
    def news_id(self):
        """News identifier: the digits of the ``<id>.html`` URL segment.

        :return: the identifier as a string, e.g. ``'11047'``.
        :raises ValueError: if ``url`` has no ``/<digits>.html`` segment.
        """
        # Greedy ``.*`` picks the last matching segment; the year is no
        # longer hard-coded and the dot before ``html`` is escaped.
        match = re.match(r'.*/(\d+)\.html', self.url)
        if match is None:
            raise ValueError('cannot extract news id from url: {!r}'.format(self.url))
        return match.group(1)


if __name__ == "__main__":
    html_url = "http://news.gzcc.cn/html/2019/xiaoyuanxinwen_0322/11047.html"
    news = News(html_url)