Assignment: Crawling Tencent News
- Understand Ajax loading
- Monitor and analyze the network requests with Chrome DevTools
- Build the crawler with Selenium
- Concrete task:
use Selenium to crawl the "Hot Picks" (热点精选) section of https://news.qq.com/ (a minimal Selenium sketch follows the imports below)
import time
from selenium import webdriver
from bs4 import BeautifulSoup
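Building on the three imports above, here is a minimal sketch of the Selenium approach. It assumes a local Chrome/chromedriver is available; the number of scroll steps and the ".item .detail > a" CSS selector are placeholders chosen for illustration and should be verified against the live DOM in Chrome DevTools.
# Open the page, scroll so the Ajax-loaded stories render, then parse the DOM.
driver = webdriver.Chrome()                      # requires Chrome + chromedriver locally
driver.get("https://news.qq.com/")
for _ in range(10):                              # scroll several times to trigger Ajax loading
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
soup = BeautifulSoup(driver.page_source, "html.parser")
for a in soup.select(".item .detail > a"):       # placeholder selector for the hot-pick cards
    title, link = a.get_text(strip=True), a.get("href")
    if title and link:
        print(title, link)
driver.quit()
The reference implementation below takes a different route: it fetches the classic "要闻" list pages directly with urllib and pulls titles and links out with regular expressions.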
# -*- coding:utf-8 -*-
# Standard libraries for fetching web pages (Python 3)
import urllib.request
import urllib.error
# Regular expressions
import re
# Random number generation
import random
# gzip decompression
import gzip
from io import BytesIO
#user-agent
user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
# Fetch a page; parameters are the URL and an optional Referer
def getHtml(url, referer=None):
    try:
        # Build the request headers (only send Referer when one is given)
        headers = {'User-Agent': user_agent}
        if referer:
            headers['Referer'] = referer
        # Build the request
        request = urllib.request.Request(url, headers=headers)
        # Fetch the page with a 45-second timeout
        response = urllib.request.urlopen(request, timeout=45)
        # Use the charset advertised in the response headers, falling back to utf-8
        charset = response.info().get_content_charset() or 'utf-8'
        # Decompress first if the response is gzip-compressed, otherwise read directly
        if response.info().get('Content-Encoding') == 'gzip':
            buf = BytesIO(response.read())
            html = gzip.GzipFile(fileobj=buf).read().decode(charset, errors='ignore')
        else:
            html = response.read().decode(charset, errors='ignore')
        return html
    # The request failed
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        elif hasattr(e, 'reason'):
            print(e.reason)
        return None
    # Any other exception
    except Exception:
        return None
def tencentStart():
    # Tencent news index page
    INDEX_URL = 'http://news.qq.com/top_index.shtml#hotnews'
    # Address template for the "要闻" (top stories) list pages
    SUB_URL = "http://news.qq.com/c/2013ywList_{0}.htm"
    # Regex for the total page count
    PAGE_PATTERNS = r'getString.pageCount.*?=.*?(\d+);'
    # Regex for titles and links
    NEWS_PATTERNS = r'<em.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?</em>'
    # Referer sent with the list-page requests
    TENCENT_REFER = "http://news.qq.com/"
    # Fetch the index page
    html = getHtml(INDEX_URL)
    if html is None:
        print("Failed to fetch the index page")
        return None
    # Extract the total number of list pages
    pattern = re.compile(PAGE_PATTERNS, re.S)
    countRe = re.search(pattern, html)
    count = 1
    if countRe is not None:
        count = int(countRe.group(1))
    # Build each list-page address and request it
    for index in range(count):
        realIndex = index + 1
        # Build the address; the random query string defeats caching
        url = SUB_URL.format(realIndex) + '?' + str(random.random())
        html = getHtml(url, TENCENT_REFER)
        if html is None:
            continue
        # Compile the title/link regex
        pattern = re.compile(NEWS_PATTERNS, re.S)
        # Extract all titles and links
        Res = re.findall(pattern, html)
        if not Res:
            continue
        # Print every link and title
        for item in Res:
            print(item[0] + "\n")
            print(item[1] + "\n")

if __name__ == '__main__':
    tencentStart()
Advanced extra: a Zhihu crawler
The target link:
https://www.zhihu.com/search?q=Datawhale&utm_content=search_history&type=content
Implement it with the requests library; browser automation with Selenium is not allowed.
Hints:
The link requires a login. You may search GitHub (or elsewhere) for existing Zhihu-login code, understand its logic, and copy-and-paste it for this task.
As with the Ajax loading above, the Ajax-loaded content must this time be crawled with requests. The storage format of the results is up to you, but you must analyze the Ajax flow with Chrome DevTools and write that analysis up.
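As an illustration only, here is a minimal sketch of what that Ajax flow typically looks like, assuming the search page pages its results through a JSON endpoint such as https://www.zhihu.com/api/v4/search_v3 with offset/limit parameters; the endpoint, parameter names, response fields, and the cookie placeholder below are all assumptions and must be confirmed against your own capture in the DevTools Network panel.
import requests

# Sketch of the assumed Ajax flow: search results come from a JSON API, paged by offset.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    # The search page requires a login: paste the Cookie header of a logged-in
    # browser session here (or reuse cookies produced by the login code you found).
    "Cookie": "<your logged-in Zhihu cookies>",
}
SEARCH_API = "https://www.zhihu.com/api/v4/search_v3"    # assumed endpoint seen in DevTools
params = {"t": "general", "q": "Datawhale", "offset": 0, "limit": 20}

offset = 0
while True:
    params["offset"] = offset
    resp = requests.get(SEARCH_API, headers=headers, params=params, timeout=30)
    if resp.status_code != 200:                          # not logged in, blocked, or API changed
        break
    data = resp.json()
    for item in data.get("data", []):                    # field names are assumptions
        obj = item.get("object", {})
        print(obj.get("title"), obj.get("url"))
    if data.get("paging", {}).get("is_end", True):       # stop when the API reports the last page
        break
    offset += 20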