Approach: crawl the Sina domestic-news portal to collect every URL on the page, deduplicate the URLs, then loop over them and scrape the body of each article; any page that cannot be parsed is skipped.
Modules: requests, pymysql, BeautifulSoup, datetime
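These are the imports every snippet below assumes:

import requests
import pymysql
from bs4 import BeautifulSoup
from datetime import datetime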
Portal: the URL and request headers used for crawling.
newsurl2 = 'https://news.sina.com.cn/china/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'Referer': newsurl2
}
Function to collect the URLs:

# Collect candidate article URLs from the portal page
def getUrl(newsurl2, headers):
    urlList = []
    r = requests.get(newsurl2, headers=headers)
    if r.status_code == 200:
        r.encoding = 'utf-8'
        rsoup = BeautifulSoup(r.text, 'html.parser')
        for a in rsoup.find_all('a'):
            href = a.get('href')
            if href:  # skip <a> tags without an href
                urlList.append(href)
        return list(set(urlList))  # deduplicate the URLs
    else:
        print('request failed with status', r.status_code)
        return []  # return an empty list instead of None so the caller's loop still works
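A quick usage sketch: the raw <a> hrefs include navigation and JavaScript links as well as articles, so a coarse filter can cut down the noise before crawling. The .shtml suffix check is an assumption about Sina's article URLs, not part of the original code:

urlList = getUrl(newsurl2, headers)
# Assumption: Sina article pages end in .shtml; adjust if the site changes.
articleUrls = [u for u in urlList if u.endswith('.shtml')]
print(len(urlList), 'links found,', len(articleUrls), 'look like articles')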
Database access:
class DatabaseAccess():
    def __init__(self):
        self.__db_host = "localhost"
        self.__db_port = 3306
        self.__db_user = "root"
        self.__db_password = "123456"
        self.__db_database = "school"

    def isConnectionOpen(self):
        self.__db = pymysql.connect(
            host=self.__db_host,
            port=self.__db_port,
            user=self.__db_user,
            password=self.__db_password,
            database=self.__db_database,
            charset='utf8'
        )

    def linesInsert(self, urli, titlei, timei, editori, articlei):
        self.isConnectionOpen()
        cursor = self.__db.cursor()  # local cursor instead of the original's global
        try:
            sql = "insert into news(url,title,time,editor,article) values(%s,%s,%s,%s,%s)"
            cursor.execute(sql, (urli, titlei, timei, editori, articlei))
            self.__db.commit()  # commit only on success rather than unconditionally in finally
            print("insert succeeded")
        except Exception as e:
            print("insert failed:", e)
            self.__db.rollback()
        finally:
            cursor.close()
            self.__db.close()
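The insert statement assumes a news table already exists in the school database. Here is a minimal one-off setup sketch; only the column names come from the code above, while the column types and the id primary key are assumptions:

import pymysql

# Assumed schema: column names match the insert above, types are guesses.
ddl = """
CREATE TABLE IF NOT EXISTS news (
    id      INT AUTO_INCREMENT PRIMARY KEY,
    url     VARCHAR(255),
    title   VARCHAR(255),
    time    DATE,
    editor  VARCHAR(64),
    article TEXT
) DEFAULT CHARSET = utf8
"""

db = pymysql.connect(host="localhost", port=3306, user="root",
                     password="123456", database="school", charset="utf8")
with db.cursor() as cursor:
    cursor.execute(ddl)
db.commit()
db.close()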
Scraping a single page:
# Fetch a single article page and extract its fields
def getNewsDetail(newsurl, headers):
    result = {}
    res = requests.get(newsurl, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    [s.extract() for s in soup('script')]  # drop <script> tags so they do not pollute the text
    result['url'] = newsurl
    result['title'] = soup.select('.main-title')[0].text  # headline: class="main-title"
    timeSource = soup.select('.date-source span')[0].text  # publish time: <span> inside class="date-source"
    result['time'] = datetime.strptime(timeSource, '%Y年%m月%d日 %H:%M').strftime('%Y-%m-%d')
    article = []
    for p in soup.select('#article p')[:-1]:  # body paragraphs: <p> tags under id="article"
        article.append('<p>' + p.text.strip() + '</p>')  # wrap each paragraph in <p> for HTML display
    result['article'] = ' '.join(article)
    # str.strip('责任编辑:') strips a character set, not a prefix; remove the label explicitly instead
    result['editor'] = soup.select('.show_author')[0].text.replace('责任编辑:', '').strip()
    print("page content extracted")
    return result
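For example, the format string turns Sina's Chinese-style timestamp into a plain date (the sample timestamp is illustrative):

>>> from datetime import datetime
>>> datetime.strptime('2019年07月08日 10:29', '%Y年%m月%d日 %H:%M').strftime('%Y-%m-%d')
'2019-07-08'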
Loop over the URLs and store each article in the database; every paragraph of the body is wrapped in a <p> tag so the news displays properly in HTML:

# Crawl each news URL and store the extracted fields in the database
def getSQL(urlList):
    count = 0
    db = DatabaseAccess()
    for urli in urlList:
        count += 1
        print(urli)
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            'Referer': urli
        }
        try:
            data = getNewsDetail(urli, header)
            print(data['title'])
            db.linesInsert(data['url'], data['title'], data['time'], data['editor'], data['article'])
        except Exception as e:
            print('skipped:', e)  # pages that do not match the article layout are skipped
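Finally, a short driver tying the steps together; this entry point is not in the original code, but it is the natural way to run the pipeline described above:

if __name__ == '__main__':
    urlList = getUrl(newsurl2, headers)
    getSQL(urlList)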