urllib
The urllib package provides five modules (urllib.request, urllib.response, urllib.parse, urllib.robotparser, and urllib.error) that support reading web page content.
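As a small illustration of one of these modules, urllib.robotparser can check whether a site's robots.txt permits crawling a given URL. A minimal sketch (the URL is used only for demonstration):
from urllib.robotparser import RobotFileParser
rp = RobotFileParser()
rp.set_url('https://www.python.org/robots.txt')
rp.read()                                             # download and parse robots.txt
print(rp.can_fetch('*', 'https://www.python.org/'))   # True if crawling this URL is allowed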
1. Reading and displaying web page content
import urllib.request
fp = urllib.request.urlopen(r'http://www.python.org')
print(fp.read(100))           # read the first 100 bytes
print(fp.read(100).decode())  # read the next 100 bytes and decode them as UTF-8
fp.close()                    # close the connection
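The urllib.error module listed above supplies the exceptions that urllib.request raises; a minimal sketch of catching them when opening a URL:
import urllib.request
import urllib.error
try:
    with urllib.request.urlopen('http://www.python.org', timeout=10) as fp:
        print(fp.read(100))
except urllib.error.HTTPError as e:   # the server returned an error status code
    print('HTTP error:', e.code)
except urllib.error.URLError as e:    # network failure or malformed URL
    print('URL error:', e.reason)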
2. Submitting web page parameters
import urllib.request
import urllib.parse
params = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
url = "http://www.musi-cal.com/cgi-bin/query?%s" % params
with urllib.request.urlopen(url) as f:
    print(f.read().decode('utf-8'))
Basic usage
Submitting parameters with the POST method
import urllib.request
import urllib.parse
data = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
data = data.encode('ascii')
with urllib.request.urlopen("http://request.in/xrbl82xr", data) as f:
    print(f.read().decode('utf-8'))
Accessing a page through an HTTP proxy
import urllib.request
proxies = {'http': 'http://proxy.example.com:8080/'}
opener = urllib.request.FancyURLopener(proxies)
with opener.open("https://www.python.org") as f:
    f.read().decode('utf-8')
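Note that FancyURLopener is deprecated in current Python 3 releases; a sketch of the same idea with urllib.request.ProxyHandler and build_opener (the proxy address is a placeholder):
import urllib.request
proxy_handler = urllib.request.ProxyHandler({'http': 'http://proxy.example.com:8080/'})
opener = urllib.request.build_opener(proxy_handler)
with opener.open('http://www.python.org') as f:
    print(f.read(100).decode('utf-8'))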
Crawler examples
Crawling the images in a WeChat Official Account article
1. Determine the article URL
https://mp.weixin.qq.com/s?__biz=MzI4MzM2MDgyMQ==&mid=2247486249&idx=1&sn=a37d079f541b194970428fb2fd7a1ed4&chksm=eb8aa073dcfd2965f2d48c5ae9341a7f8a1c2ae2c79a68c7d2476d8573c91e1de2e237c98534&scene=21#wechat_redirect
2. Open the article in a browser and examine the page source. The image links in the article have the following format:
<p><img data-s="300,640" data-type="png" data-src="http://mmbiz.qpic.cn/mmbiz_png/xXrickrc6JTO9TThicnuGGR7DtzWtslaBlYS5QJ73u2WpzPW8KX8iaCdWcNYia5YjYpx89K78YwrDamtkxmUXuXJfA/0?wx_fmt=png" style="" class="" data-ratio="0.5580865603644647" data-w="878" /></p>
3. Determine the regular expression
pattern = 'data-type="png" data-src="(.+?)"'
4. Write the Python crawler program
from re import findall
from urllib.request import urlopen
url = 'https://mp.weixin.qq.com/s?__biz=MzI4MzM2MDgyMQ==&mid=2247486249&idx=1&sn=a37d079f541b194970428fb2fd7a1ed4&chksm=eb8aa073dcfd2965f2d48c5ae9341a7f8a1c2ae2c79a68c7d2476d8573c91e1de2e237c98534&scene=21#wechat_redirect'
with urlopen(url) as fp:
    content = fp.read().decode()
pattern = 'data-type="png" data-src="(.+?)"'
# find all image link addresses
result = findall(pattern, content)
for index, item in enumerate(result):
    with urlopen(str(item)) as fp:
        with open(str(index) + '.png', 'wb') as fp1:
            fp1.write(fp.read())
Scrapy crawlers
1. After installing Scrapy with pip, run the following commands at a command prompt to create the project MyCraw:
scrapy startproject MyCraw
cd MyCraw
scrapy genspider MySpider www.sdibt.edu.cn
2. Write the Python program that crawls the target pages (a sketch is given after these steps):
code\MySpider.py
3. Run the crawler
scrapy crawl MySpider
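The file code\MySpider.py is not reproduced here. A minimal sketch of what such a spider might look like, assuming it simply saves the pages it visits; the parse logic is an illustration, not the author's actual code:
import scrapy

class MySpider(scrapy.Spider):
    name = 'MySpider'
    allowed_domains = ['www.sdibt.edu.cn']
    start_urls = ['http://www.sdibt.edu.cn/']

    def parse(self, response):
        # save the raw HTML of each crawled page (assumed behavior)
        filename = response.url.split('/')[-1] or 'index.html'
        with open(filename, 'wb') as fp:
            fp.write(response.body)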
Crawling a novel from the Tianya forum
1. Determine the URL
http://bbs.tianya.cn/post-16-1126849-1.shtml
2. Examine the page source to identify the author, work out how to select the author's posts while filtering out everyone else's, and determine the pattern shared by the novel's different page URLs (see the sketch after these steps)
3. Create the crawler project
1. Open a command prompt, change to the Scripts directory of the Python installation, and run: scrapy startproject xiaoshuo
2. Change into the project directory, generate the spider with scrapy genspider spiderYichangGuishi bbs.tianya.cn, then modify and complete the Python program
3. Run the command scrapy crawl spiderYichangGuishi
4. After a short wait, the full text of the novel is crawled and saved as a plain-text document
code\spiderYiChang.py
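code\spiderYiChang.py is likewise not reproduced here. A rough sketch of the approach from step 2, assuming each post block carries an attribute identifying its author; the XPath expressions, the attribute name, and the AUTHOR_ID placeholder are assumptions for illustration, not the actual page structure:
import scrapy

class SpiderYichangGuishi(scrapy.Spider):
    name = 'spiderYichangGuishi'
    start_urls = ['http://bbs.tianya.cn/post-16-1126849-1.shtml']

    def parse(self, response):
        # keep only the posts published by the novel's author
        # (the attribute name and AUTHOR_ID value below are assumed)
        for post in response.xpath('//div[@_host="AUTHOR_ID"]'):
            text = ''.join(post.xpath('.//text()').getall())
            with open('result.txt', 'a', encoding='utf-8') as fp:
                fp.write(text.strip() + '\n')
        # successive page URLs differ only in the trailing page number,
        # so following the "next page" link covers the whole novel
        next_page = response.xpath('//a[text()="下页"]/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)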
Introduction to BeautifulSoup
>>> from bs4 import BeautifulSoup
>>> BeautifulSoup('hello world', 'lxml')        # fragments are normalized into complete documents
<html><body><p>hello world</p></body></html>
>>> BeautifulSoup('<span>hello world', 'lxml')
<html><body><span>hello world</span></body></html>
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister"
id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister"
id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
>>> soup = BeautifulSoup(html_doc, 'html.parser')   # lxml or another parser can also be specified
>>> print(soup.prettify())                          # display the document nicely indented
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
   and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
>>> soup.title                    # access the <title> tag
<title>The Dormouse's story</title>
>>> soup.title.name               # view the tag's name
'title'
>>> soup.title.text               # view the tag's text
"The Dormouse's story"
>>> soup.title.string             # view the tag's text
"The Dormouse's story"
>>> soup.title.parent             # view the parent tag
<head><title>The Dormouse's story</title></head>
>>> soup.head                     # view the <head> tag
<head><title>The Dormouse's story</title></head>
>>> soup.b                        # access the <b> tag
<b>The Dormouse's story</b>
>>> soup.body.b                   # access the <b> tag inside <body>
<b>The Dormouse's story</b>
>>> soup.name                     # the whole BeautifulSoup object behaves like a tag
'[document]'
>>> soup.body
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
import re
soup.find_all(href=re.compile("elsie"))       # find tags whose href matches a regular expression
soup.find(id='link3')                         # find the first tag with id 'link3'
soup.find_all('a', id='link3')                # find all <a> tags with id 'link3'
for link in soup.find_all('a'):
    print(link.text, ':', link.get('href'))   # show each link's text and target
print(soup.get_text())                        # all the text in the document
soup.a['id'] = 'test_link1'                   # modify an attribute of the first <a> tag
soup.a                                        # view the modified tag
soup.a.string.replace_with('test_Elsie')      # replace the tag's text
soup.a.string                                 # view the replaced text
for child in soup.body.children:              # iterate over the direct children of <body>
    print(child)
>>> test_doc = '<html><head></head><body><p></p><p></p></body></html>'
>>> s = BeautifulSoup(test_doc, 'lxml')
>>> for child in s.html.children:       # iterate over the direct child tags
    print(child)

<head></head>
<body><p></p><p></p></body>
>>> for child in s.html.descendants:    # iterate over all the descendant tags
    print(child)

<head></head>
<body><p></p><p></p></body>
<p></p>
<p></p>
Basic operations with the requests library
Adding headers and setting an access proxy
import requests
url = 'https://api.github.com/some/endpoint'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
Accessing a page and submitting data
payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.post("http://httpbin.org/post", data=payload)
print(r.text)
url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}
r = requests.post(url, json=payload)   # send the payload as JSON
print(r.text)
print(r.headers)
print(r.headers['Content-Type'])
print(r.headers['Content-Encoding'])
Getting and setting cookies
r = requests.get("http://www.baidu.com/")
r.cookies          # cookies returned by the server
Setting the cookies parameter
url = 'http://httpbin.org/cookies'
cookies = dict(cookies_are='working')
r = requests.get(url, cookies=cookies)
print(r.text)
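Cookies can also be carried across requests automatically by a requests.Session; a minimal sketch using httpbin.org test endpoints:
import requests

s = requests.Session()
s.get('http://httpbin.org/cookies/set/sessioncookie/123456')   # the server sets a cookie
r = s.get('http://httpbin.org/cookies')                        # the session sends it back
print(r.text)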
Use the requests library to crawl all hyperlinks in the article "Python使用集合实现素数筛选法" from the WeChat Official Account "Python小屋".
import re
import requests
r = requests.get(url)   # url: the address of the article mentioned above
links = re.findall(r'<a .*?href="(.+?)"', r.text)
for link in links:
    if link.startswith('http'):
        print(link)
# the same task with BeautifulSoup
from bs4 import BeautifulSoup
soup = BeautifulSoup(r.content, 'lxml')
for link in soup.find_all('a'):
    href = link.get('href')
    if href and href.startswith('http'):   # skip anchors without an href attribute
        print(href)
# download an image
import requests
picUrl = r'https://www.python.org/static/opengraph-icon-200x200.png'
r = requests.get(picUrl)
r.status_code                 # 200 indicates success
with open('pic.png', 'wb') as fp:
    fp.write(r.content)
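For large files, passing stream=True and writing the body in chunks avoids loading the whole file into memory at once; a sketch using the same picUrl:
import requests

picUrl = r'https://www.python.org/static/opengraph-icon-200x200.png'
r = requests.get(picUrl, stream=True)
with open('pic.png', 'wb') as fp:
    for chunk in r.iter_content(chunk_size=8192):   # write 8 KB at a time
        fp.write(chunk)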