1. Extract every news item from a single news list page, wrapped up as a function.
2. Get the total number of news articles and work out the total page count (see the sketch after this list).
3. Fetch the full details of every article on every list page.
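The page count in step 2 is just a ceiling division of the article total by the 10 items shown per list page. A minimal sketch of that calculation (the full script below does the same thing with an explicit remainder check):

# Ceiling division: 10 news items per list page, so 95 articles -> 10 pages.
def page_count(total_articles, per_page=10):
    return (total_articles + per_page - 1) // per_page

assert page_count(95) == 10
assert page_count(90) == 9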
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import locale
import re


def getClickCount(newsUrl):
    # The news id is taken from the last four characters before ".html";
    # the click count comes from a separate counter API that returns a
    # jQuery snippet such as "...html('123');".
    newsid = re.search(r'_(.*)\.html', newsUrl).group(1)[-4:]
    clicktimesurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsid)
    clicktimes = int(requests.get(clicktimesurl).text.split(".html(")[-1].lstrip("'").rstrip("');"))
    return clicktimes


def getNewsDetail(newsUrl):
    resdet = requests.get(newsUrl)
    resdet.encoding = 'utf-8'
    soupdet = BeautifulSoup(resdet.text, 'html.parser')
    contentdetail = soupdet.select('#content')[0].text
    showinfo = soupdet.select('.show-info')[0].text
    # Cut the "YYYY-MM-DD HH:MM:SS" timestamp out of the info line.
    # str.lstrip() strips a character *set*, not a prefix, so a regex is safer here.
    date = re.search(r'发布时间:(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', showinfo).group(1)
    if showinfo.find('作者') > 0:
        author = re.search(r'作者:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
    else:
        author = 'none'
    if showinfo.find('审核') > 0:
        checker = re.search(r'审核:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
    else:
        checker = 'none'
    if showinfo.find('来源') > 0:
        # Parenthesize the alternation: the original "来源:(.*)\s*来|摄|点" could
        # match a bare "摄" or "点", leaving group(1) as None.
        source = re.search(r'来源:(.*?)\s*(来|摄|点|$)', showinfo).group(1)
    else:
        source = 'none'
    if showinfo.find('摄影') > 0:
        photographer = re.search(r'摄影:(.*)\s点', showinfo).group(1)
    else:
        photographer = 'none'
    clicktimes = getClickCount(newsUrl)
    dateTime = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    print("发表时间:{0} 作者:{1} 审核:{2} 来源:{3} 摄像:{4} 点击次数:{5} 次".format(
        dateTime, author, checker, source, photographer, clicktimes))
    print(contentdetail)


def getListDetail(ListPageUrl):
    # Extract every news item on one list page, then fetch each detail page.
    resl = requests.get(ListPageUrl)
    resl.encoding = 'utf-8'
    soupl = BeautifulSoup(resl.text, 'html.parser')
    for news in soupl.select('li'):
        if len(news.select('.news-list-title')) > 0:
            title = news.select('.news-list-title')[0].text
            description = news.select('.news-list-description')[0].text
            info = news.select('.news-list-info')[0].text
            address = news.select('a')[0]['href']
            print("\n标题: {0}\n描述: {1}\n信息: {2}\n链接: {3}".format(title, description, info, address))
            getNewsDetail(address)


locale.setlocale(locale.LC_CTYPE, 'chinese')  # Windows locale name for Chinese console output
Listurl = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
res = requests.get(Listurl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
# The total article count sits in the ".a1" element, e.g. "9876条"; 10 articles per page.
ListCount = int(soup.select('.a1')[0].text.rstrip('条'))
if ListCount % 10 > 0:
    pagecount = ListCount // 10 + 1
else:
    pagecount = ListCount // 10
for i in range(1, pagecount + 1):
    # The first list page has no page number in its URL; later pages are "2.html", "3.html", ...
    if i == 1:
        ListPageUrl = Listurl
    else:
        ListPageUrl = Listurl + '{}.html'.format(i)
    getListDetail(ListPageUrl)
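The counter API answers with a small jQuery snippet, which the script above parses by splitting on ".html(" and stripping the quotes by hand. A single regex capture is a touch more robust against surrounding whitespace; a minimal alternative sketch, assuming the response keeps the `...html('N');` shape implied by the parsing above:

import re
import requests

def get_click_count(news_url):
    # Assumes the detail URL ends in "<id>.html" and the API echoes the count
    # inside a jQuery call such as "$('#hits').html('123');" (hypothetical sample).
    news_id = re.search(r'_(.*)\.html', news_url).group(1)[-4:]
    api = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id)
    match = re.search(r"\.html\('(\d+)'\)", requests.get(api).text)
    return int(match.group(1)) if match else 0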
4. Pick a topic you are interested in, scrape its data, and run a word-segmentation analysis on it. The topic must not duplicate another student's.
import requests
from bs4 import BeautifulSoup
import jieba


def getnewsdetail(newsurl):
    resd = requests.get(newsurl)
    resd.encoding = 'gbk'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    paragraphs = soupd.select('.text')
    content = ''
    for p in paragraphs:
        content += p.text + '\n'
    # Some items are picture-only news; skip the word analysis for those.
    if len(paragraphs) > 0:
        print(content + "\n词频统计如下:")
        # Stop words and punctuation to exclude from the frequency count.
        delword = ['我', '他', '你', '了', '那', '又', '-', '的', '我们', '是', '但', '中', '这', '在',
                   '也', '都', '而', '\n', ',', '。', '?', '!', '“', '”', ':', ';', '、', '.',
                   '‘', '’', '(', ')', ' ', '【', '】', '…']
        wordDict = {}
        newscontent = list(jieba.cut(content))
        wordset = set(newscontent) - set(delword)
        for w in wordset:
            wordDict[w] = newscontent.count(w)
        sort = sorted(wordDict.items(), key=lambda item: item[1], reverse=True)
        # Print the 20 most frequent words (fewer if the article is short,
        # which otherwise raises an IndexError).
        for i in range(min(20, len(sort))):
            print(sort[i])
    else:
        print('纯图片新闻')


def getnewslist(newsurl):
    res = requests.get(newsurl)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'html.parser')
    for newsList in soup.select('.newslist')[0].select('li'):
        title = newsList.select('a')[0].text
        publishtime = newsList.select('.pub_time')[0].text
        address = newsList.select('a')[0]['href']
        print('\n标题:{0}\n发表时间:{1}\n新闻链接:{2}\n'.format(title, publishtime, address))
        getnewsdetail(address)


# Register custom words (F1 driver and team names) so jieba does not split them apart.
jieba.add_word('维斯塔潘')
jieba.add_word('维特尔')
jieba.add_word('范多恩')
jieba.add_word('加斯利')
jieba.add_word('托斯特')
jieba.add_word('小红牛')
jieba.add_word('大红牛')
jieba.add_word('库比卡')
jieba.add_word('马格努森')
jieba.add_word('倍耐力')

url = "http://sports.qq.com/l/f1/allf1news/list20100311191657.htm"
# Page 1 has no suffix; pages 2-100 append "_{page}" to the list URL.
# (The original also called getnewslist(url) before the loop, crawling page 1 twice.)
for i in range(1, 101):
    if i == 1:
        getnewslist(url)
    else:
        newsurl = "http://sports.qq.com/l/f1/allf1news/list20100311191657_{}.htm".format(i)
        getnewslist(newsurl)
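The manual dict plus list.count loop above is quadratic in the number of distinct words; collections.Counter does the same tally in one pass and sorts for you via most_common. A minimal sketch of just the counting step, under the same stop-word idea (the stopwords set passed in the example is illustrative, not the full list from the script):

from collections import Counter

import jieba

def top_words(text, stopwords, n=20):
    # Tokenize with jieba, drop stop words and whitespace, tally in one pass.
    words = (w for w in jieba.cut(text) if w.strip() and w not in stopwords)
    return Counter(words).most_common(n)

# Usage: the real script would feed the scraped article text in here.
print(top_words('维特尔和维斯塔潘在上海站展开了激烈的争夺。', {'和', '在', '了', '。'}))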