Assignment: Crawling Tencent News
- Understand Ajax loading
- Monitor and analyze the network requests with Chrome DevTools
- Build the crawler with Selenium
- Concrete task:
use Selenium to crawl the "Hot Picks" (热点精选) section of https://news.qq.com/ (a minimal Selenium sketch follows the imports below)
import time
from selenium import webdriver
from bs4 import BeautifulSoup
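Building on the three imports above, here is a minimal sketch of the Selenium approach. It assumes a local Chrome/chromedriver is available; the number of scroll steps and the ".item .detail > a" CSS selector are placeholders chosen for illustration and should be verified against the live DOM in Chrome DevTools.
# Open the page, scroll so the Ajax-loaded stories render, then parse the DOM.
driver = webdriver.Chrome()                      # requires Chrome + chromedriver locally
driver.get("https://news.qq.com/")
for _ in range(10):                              # scroll several times to trigger Ajax loading
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
soup = BeautifulSoup(driver.page_source, "html.parser")
for a in soup.select(".item .detail > a"):       # placeholder selector for the hot-pick cards
    title, link = a.get_text(strip=True), a.get("href")
    if title and link:
        print(title, link)
driver.quit()
The reference implementation below takes a different route: it fetches the classic "要闻" list pages directly with urllib and pulls titles and links out with regular expressions.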
# -*- coding:utf-8 -*-
# Standard libraries for fetching web pages (Python 3)
import urllib.request
import urllib.error
# Regular expressions
import re
# Random number generation
import random
# gzip decompression
import gzip
from io import BytesIO
#user-agent
user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
# Fetch a page; parameters are the URL and an optional Referer
def getHtml(url, referer=None):
    try:
        # Build the request headers (only send Referer when one is given)
        headers = {'User-Agent': user_agent}
        if referer:
            headers['Referer'] = referer
        # Build the request
        request = urllib.request.Request(url, headers=headers)
        # Fetch the page with a 45-second timeout
        response = urllib.request.urlopen(request, timeout=45)
        # Use the charset advertised in the response headers, falling back to utf-8
        charset = response.info().get_content_charset() or 'utf-8'
        # Decompress first if the response is gzip-compressed, otherwise read directly
        if response.info().get('Content-Encoding') == 'gzip':
            buf = BytesIO(response.read())
            html = gzip.GzipFile(fileobj=buf).read().decode(charset, errors='ignore')
        else:
            html = response.read().decode(charset, errors='ignore')
        return html
    # The request failed
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        elif hasattr(e, 'reason'):
            print(e.reason)
        return None
    # Any other exception
    except Exception:
        return None
def tencentStart():
    # Tencent news index page
    INDEX_URL = 'http://news.qq.com/top_index.shtml#hotnews'
    # Address template for the "要闻" (top stories) list pages
    SUB_URL = "http://news.qq.com/c/2013ywList_{0}.htm"
    # Regex for the total page count
    PAGE_PATTERNS = r'getString.pageCount.*?=.*?(\d+);'
    # Regex for titles and links
    NEWS_PATTERNS = r'<em.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?</em>'
    # Referer sent with the list-page requests
    TENCENT_REFER = "http://news.qq.com/"
    # Fetch the index page
    html = getHtml(INDEX_URL)
    if html is None:
        print("Failed to fetch the index page")
        return None
    # Extract the total number of list pages
    pattern = re.compile(PAGE_PATTERNS, re.S)
    countRe = re.search(pattern, html)
    count = 1
    if countRe is not None:
        count = int(countRe.group(1))
    # Build each list-page address and request it
    for index in range(count):
        realIndex = index + 1
        # Build the address; the random query string defeats caching
        url = SUB_URL.format(realIndex) + '?' + str(random.random())
        html = getHtml(url, TENCENT_REFER)
        if html is None:
            continue
        # Compile the title/link regex
        pattern = re.compile(NEWS_PATTERNS, re.S)
        # Extract all titles and links
        Res = re.findall(pattern, html)
        if not Res:
            continue
        # Print every link and title
        for item in Res:
            print(item[0] + "\n")
            print(item[1] + "\n")

if __name__ == '__main__':
    tencentStart()
Advanced extra: a Zhihu crawler
The target link:
https://www.zhihu.com/search?q=Datawhale&utm_content=search_history&type=content
Implement it with the requests library; browser automation with Selenium is not allowed.
Hints:
The link requires a login. You may search GitHub (or elsewhere) for existing Zhihu-login code, understand its logic, and copy-and-paste it for this task.
As with the Ajax loading above, the Ajax-loaded content must this time be crawled with requests. The storage format of the results is up to you, but you must analyze the Ajax flow with Chrome DevTools and write that analysis up.
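As an illustration only, here is a minimal sketch of what that Ajax flow typically looks like, assuming the search page pages its results through a JSON endpoint such as https://www.zhihu.com/api/v4/search_v3 with offset/limit parameters; the endpoint, parameter names, response fields, and the cookie placeholder below are all assumptions and must be confirmed against your own capture in the DevTools Network panel.
import requests

# Sketch of the assumed Ajax flow: search results come from a JSON API, paged by offset.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    # The search page requires a login: paste the Cookie header of a logged-in
    # browser session here (or reuse cookies produced by the login code you found).
    "Cookie": "<your logged-in Zhihu cookies>",
}
SEARCH_API = "https://www.zhihu.com/api/v4/search_v3"    # assumed endpoint seen in DevTools
params = {"t": "general", "q": "Datawhale", "offset": 0, "limit": 20}

offset = 0
while True:
    params["offset"] = offset
    resp = requests.get(SEARCH_API, headers=headers, params=params, timeout=30)
    if resp.status_code != 200:                          # not logged in, blocked, or API changed
        break
    data = resp.json()
    for item in data.get("data", []):                    # field names are assumptions
        obj = item.get("object", {})
        print(obj.get("title"), obj.get("url"))
    if data.get("paging", {}).get("is_end", True):       # stop when the API reports the last page
        break
    offset += 20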