学习《python3 网络爬虫开发实战 崔庆才 》
第1章,环境,稍微配一些基本的,略过
第2章,前端三大部分:CSS, JS, Html
http基础原理
爬虫基本原理,由此想起一个经典问题:
当你在浏览器中输入baidu.com并且按下回车之后发生了什么?
第3章,一些书中的代码
urllib繁琐,需要构造handler,再构造opener,再进行请求
requests好多了
书中一些代码:
#coding=utf-8
# 3.1 urllib with a proxy: build an opener that tunnels HTTP/HTTPS
# traffic through the given proxy, then fetch a page with it.
from urllib.request import ProxyHandler, build_opener
from urllib.error import URLError

proxies = {
    'http': '106.75.164.15:3128',
    'https': '106.75.164.15:3128'
}
opener = build_opener(ProxyHandler(proxies))
try:
    resp = opener.open('http://www.baidu.com')
    print(resp.read().decode('utf-8'))
except URLError as err:
    # The proxy may be dead or refusing connections; report why.
    print(err.reason)
# 3.1 urllib cookies: capture the cookies Baidu sets and persist them
# to disk in LWP format.
import urllib.request
import http.cookiejar

cookie_jar = http.cookiejar.LWPCookieJar('baidu_cookies.txt')
cookie_opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cookie_jar))
rsp = cookie_opener.open('http://www.baidu.com')
# Save session cookies and expired cookies too, so the full set lands on disk.
cookie_jar.save(ignore_discard=True, ignore_expires=True)
# 3.1.2 Exception handling: HTTPError is a subclass of URLError,
# so the more specific class must be caught first.
from urllib import request
from urllib import error

try:
    resp = request.urlopen("https://www.google.com.hk")
except error.HTTPError as err:
    # The server answered, but with an error status.
    print(err)
    print(err.reason)
except error.URLError as err:
    # The request never completed (DNS failure, refused connection, ...).
    print(err.reason)
else:
    print("success")
# 3.1.3 Parsing links: urlsplit breaks a URL into its five components
# (scheme, netloc, path, query, fragment).
from urllib.parse import urlparse, urlsplit

tieba_url = 'https://tieba.baidu.com/f?kw=%E6%97%85%E6%B8%B8&red_tag=v2983652949'
result = urlsplit(tieba_url)
print(result)
# 3.2 使用requests
# 3.2 requests: pass query-string arguments via the `params` keyword;
# httpbin echoes them back in the response body.
import requests

query = {
    'class': 'Person',
    'object': 'person'
}
resp = requests.get('http://httpbin.org/get', params=query)
print(resp.json())
print(resp.text)
# 3.2 requests: fetch Zhihu's explore page with a browser-like
# User-Agent (anti-crawler measure) and pull question titles out of
# the HTML with a regex.
import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/69.0.3497.100 Safari/537.36'
}
resp = requests.get('http://www.zhihu.com/explore', headers=headers)
print(resp.headers)
# re.S lets '.' span newlines so one pattern can cross several tags.
pattern = re.compile('explore-feed.*?question_link.*?>(.*?)</a>', re.S)
titles = re.findall(pattern, resp.text)
print(titles)
# Cookie login: send a Cookie header copied from the browser
# (left blank here) to reuse an existing logged-in session.
import requests

login_headers = {
    'Cookie': '',
    'Host': 'www.zhihu.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
resp = requests.get('http://www.zhihu.com', headers=login_headers)
print(resp.text)
# Session keeps the conversation alive: the cookie set by the first
# request is sent automatically with the second.
import requests

se = requests.session()
se.get('http://httpbin.org/cookies/set/number/124')
resp = se.get('http://httpbin.org/cookies')
print(resp.text)
# SSL certificates: skip certificate validation (12306 historically
# served a certificate that failed verification).
import requests
import urllib3

# Silence the InsecureRequestWarning that verify=False triggers.
urllib3.disable_warnings()
resp = requests.get('https://www.12306.cn', verify=False)
print(resp.status_code)
# 5. Proxies (this sample cannot connect: the target machine actively
# refused the connection — the listed proxies are dead).
import requests

proxy_pool = {
    'http': 'http://60.216.101.46:59351',
    'https': 'http://222.173.108.70:47571'
}
resp = requests.get('https://www.taobao.com', proxies=proxy_pool)
print(resp.text)
# 6. Timeout: wait at most 1 second for the server before raising
# requests.exceptions.Timeout.
import requests
import time

resp = requests.get('https://www.baidu.com', timeout=1)
print(resp.status_code)
# 3.3 Regular expressions: collect every 'http...jpg' fragment from
# the NetEase Music homepage.
import re, requests

resp = requests.get('https://music.163.com')
# Raw string for the regex; the original trailing '.*?' was dropped —
# a lazy quantifier at the very end of a pattern always matches zero
# characters, so it had no effect on the matches.
pattern = re.compile(r'http.*?jpg')
result = re.findall(pattern, resp.text)
print(result)
# 3.4 抓取猫眼电影,此处改为抓取影院及地址信息
import requests
import re
import sys
def get_pg(n):
    """Fetch the HTML of the n-th Maoyan cinema-list page (12 cinemas
    per page); on a non-200 response, print status and reason, then
    exit the process with code 1."""
    headers_maoyan = {
        # Browser-like User-Agent to get past Maoyan's anti-crawling check.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.100 Safari/537.36'
    }
    url = 'http://maoyan.com/cinemas?offset=' + str(n * 12)
    resp = requests.get(url, headers=headers_maoyan)
    if resp.status_code != 200:
        print(resp.status_code, resp.reason)
        sys.exit(1)
    return resp.text
def get_cinema(m):
    """Scrape cinema names and addresses from the first m list pages.

    Returns a tuple of (name, address) pairs.
    """
    cinema = []   # cinema names
    addr = []     # matching addresses
    # Raw strings: '\d' inside a plain string literal is an invalid
    # escape sequence (a SyntaxWarning on modern Python).
    pattern_cinema = re.compile(r'cinema_id: \d{1,6}}">.*?<')
    pattern_addr = re.compile(r'cinema-address.*?<')
    for n in range(0, m):
        # Fetch each page ONCE and reuse it for both patterns; the
        # original called get_pg(n) twice per page, downloading the
        # same page over the network twice.
        html = get_pg(n)
        for i in re.findall(pattern_cinema, html):
            # Drop the trailing '<', keep the text after the tag's '>'.
            cinema.append(i[:-1].split('>')[1])
        for j in re.findall(pattern_addr, html):
            addr.append(j[:-1].split('>')[1])
    return tuple(zip(cinema, addr))
def main():
    """Scrape the first two result pages and print the cinema list
    (output depends on the city Maoyan geolocates the client to)."""
    print(get_cinema(2))


if __name__ == '__main__':
    main()