基本库的使用urllib
urllib包含了四个模块:
(1)request,http请求模块
(2)error,异常处理模块
(3)parse,url处理的工具模块
(4)robotparser,识别网站的robots.txt文件
1.request
from urllib import request, parse, error
import socket
'''
request.urlopen(url,data,timeout,cafile,capath,cadefault,context) url是必须的参数
'''
# Demo: POST a urlencoded form via urllib; urlopen requires the body as bytes.
data = bytes(parse.urlencode({
    'user': '1300330101'},), encoding='utf-8')
try:
    # 1-second timeout so a dead host fails fast instead of hanging
    res = request.urlopen('http://148.70.139.25/login_sure/', data=data, timeout=1)
    '''
    res:read() readinto() getheaders() getheader(name) fileno()
    msg version status reason close
    '''
    # print(res.read().decode('utf-8'))
    print(type(res))          # http.client.HTTPResponse
    print(res.getheaders())   # all response headers as (name, value) pairs
    print(res.getheader('Server'))
except error.URLError as e:
    # a socket.timeout reason means the request simply timed out
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
'''
复杂的构造请求
req=request.Request(url,data=None,headers={},origin_req_host=None,unverifiable=False,method=None)
response=request.urlopen(req)
'''
2.requests
学了这个基本不用request了
import requests
from requests.auth import HTTPBasicAuth
'''
打开的方式
r=requests.get()
requests.post()
requests.head()
requests.put()
requests.delete()
requests.options()
r.text r.cookies r.status_code r.json r.content
r.headers r.encoding r.reason r.close() r.history
'''
# Typical browser-like request headers; Cookie left empty on purpose.
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/48.0.2564.116 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
    'Cookie': ''
}
r = requests.get('https://www.baidu.com', headers=header, verify=False)  # skip SSL certificate verification
r.encoding = 'utf-8'
'''
基本用法见https://blog.csdn.net/qq_33564134/article/details/88818172
'''
# File upload -- use a context manager so the handle is closed after the POST
# (the original left the file object open forever).
with open('names', 'rb') as upload_fh:
    files = {
        'file': upload_fh}
    r = requests.post('http://httpbin.org/post', files=files)  # was 'http://httpbin/post' (unresolvable host)
# Proxy settings
proxies = {
    'http': 'http://10.10.10.1:2123',
    'https': 'https://1.1.1.1:2090',  # fixed: was misspelled scheme 'htttps://'
}
r = requests.post('url', proxies=proxies, timeout=1)
# HTTP basic authentication
r = requests.post('url', auth=HTTPBasicAuth('username', 'password'))
import requests
import re
import json
import time
def get_one_page(url):
    """Download one Maoyan board page; return its HTML text, or None on a non-200 reply."""
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/48.0.2564.116 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4'
    }
    page = requests.get(url, headers=browser_headers)
    # force UTF-8 decoding regardless of the declared charset
    page.encoding = 'utf-8'
    if page.status_code != 200:
        return None
    return page.text
def parse_one_page(html):
    """Yield one dict per movie (index/name/star/releasetime) scraped from board HTML."""
    movie_re = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?'                      # ranking index
        r'<p class="name">.*?<a href.*?data-val.*?>(.*?)</a>.*?'   # movie name
        r'<p class="star".*?>(.*?)</p>.*?'                         # starring cast
        r'<p class="releasetime".*?>(.*?)</p>.*?'                  # release time
        r'.*?</dd>', re.S)
    matches = re.findall(movie_re, html)
    print(matches)
    fields = ('index', 'name', 'star', 'releasetime')
    for match in matches:
        # stream results as a generator so callers need no intermediate list
        yield {key: value.strip() for key, value in zip(fields, match)}
def write_to_file(content):
    """Append one record to result.txt as a JSON line (UTF-8, non-ASCII kept readable)."""
    line = json.dumps(content, ensure_ascii=False)
    with open('result.txt', 'a', encoding='utf-8') as out:
        out.write(line + '\n')
def main(offset):
    """Fetch one board page at the given offset and persist every parsed movie."""
    url = 'https://maoyan.com/board/4?offset={}'.format(offset)
    print(url)
    page_html = get_one_page(url)
    for movie in parse_one_page(page_html):
        print(movie)
        write_to_file(movie)
if __name__ == '__main__':
    # 10 pages of 10 movies each (offsets 0, 10, ..., 90);
    # pause between requests to be polite to the server
    for offset in range(0, 100, 10):
        main(offset)
        time.sleep(2)
解析库的使用
Beautiful Soup
如果说寻找特定的内容
from bs4 import BeautifulSoup
import lxml
import re
# Soup object kinds: tag, tag.name, tag attributes, contained text
# tag = soup.b   tag.name   tag['class'] | tag.attrs   tag.string
# Traversal:
# contents -- list every child node
# len(soup.contents) -- number of child nodes
# find_all(tag, ...)
#
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...
</body>
"""
# Build the soup with the lxml parser
soup = BeautifulSoup(html, 'lxml')
# print(soup.prettify()) # auto-repairs the (deliberately broken) markup and pretty-prints it
# print(soup.body.contents) # everything contained in <body>
# print(soup.head.contents[0].string) # find the <title> tag and read its text
# print(soup.find_all('a')) # every <a> tag
# iterate tags whose name starts with "b" (body, b)
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
AJAX数据的爬取
AJAX 不是新的编程语言,而是一种使用现有标准的新方法。
AJAX 最大的优点是在不重新加载整个页面的情况下,可以与服务器交换数据并更新部分网页内容。
<script>
// Classic XMLHttpRequest demo: fetch /try/ajax/demo_get.php asynchronously
// and inject the response into #myDiv without reloading the page.
function loadXMLDoc()
{
    var xmlhttp;
    if (window.XMLHttpRequest)
    {
        // code for IE7+, Firefox, Chrome, Opera, Safari
        xmlhttp=new XMLHttpRequest();
    }
    else
    {
        // code for IE6, IE5 (no native XMLHttpRequest)
        xmlhttp=new ActiveXObject("Microsoft.XMLHTTP");
    }
    xmlhttp.onreadystatechange=function()
    {
        // readyState 4 = request finished, status 200 = OK
        if (xmlhttp.readyState==4 && xmlhttp.status==200)
        {
            document.getElementById("myDiv").innerHTML=xmlhttp.responseText;
        }
    }
    xmlhttp.open("GET","/try/ajax/demo_get.php",true);
    xmlhttp.send();
}
</script>
import requests
from urllib.parse import urlencode
# Weibo mobile API endpoint; per-page query params are appended in get_page().
base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/p/2304131618051664_-_WEIBO_SECOND_PROFILE_WEIBO',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    # mark the request as AJAX so the server answers with JSON
    'X-Requested-With': 'XMLHttpRequest',
}
# number of listing pages to crawl
max_page = 10
def get_page(page):
    """Fetch one page of the weibo container API.

    Returns (json_dict, page); json_dict is None when the request fails or
    the server answers with a non-200 status, so ``parse_page(*result)``
    callers can always unpack the result safely.
    """
    params = {
        'page_type': '03',
        'containerid': '2304131618051664_-_WEIBO_SECOND_PROFILE_WEIBO',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json(), page
    except requests.ConnectionError as e:
        print('Error', e.args)
    # Previously this fell through returning bare None, which made the
    # `parse_page(*json)` call in __main__ raise TypeError on any failure.
    return None, page
def parse_page(json, page: int):
    """Yield simplified weibo dicts ({'id', 'title', 'content'}) from one API page.

    Card index 1 on page 1 is skipped -- NOTE(review): presumably a pinned /
    profile card; confirm against the live API.
    """
    print(json)
    if not json:
        return
    # Defensive lookups: a response missing 'data' or with null 'cards'
    # yields nothing instead of raising AttributeError/TypeError
    # (the original did json.get('data').get('cards') unguarded).
    items = json.get('data', {}).get('cards') or []
    for index, item in enumerate(items):
        if page == 1 and index == 1:
            continue
        mblog = item.get('mblog', {})
        page_info = mblog.get('page_info', {})
        yield {
            'id': mblog.get('id'),
            'title': page_info.get('content1'),
            'content': page_info.get('content2'),
        }
if __name__ == '__main__':
    for page in range(1, max_page + 1):
        # renamed from `json` -- the original shadowed the json module
        result = get_page(page)
        # get_page returns None (or a (None, page) pair) when the request
        # fails; skip such pages instead of crashing on unpacking
        if not result or result[0] is None:
            continue
        for weibo in parse_page(*result):
            print(weibo)
import requests
from urllib.parse import urlencode
from requests import codes
import os
from hashlib import md5
from multiprocessing.pool import Pool
import re
def get_page(offset):
    """Fetch one page of Toutiao search results (keyword 街拍) and return the JSON dict.

    Returns None on a connection error or a non-200 status.
    """
    params = {
        'aid': '24',
        'offset': offset,
        'format': 'json',
        # 'keyword': '街拍',  # already URL-encoded inside base_url
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis'
    }
    base_url = 'https://www.toutiao.com/api/search/content/?keyword=%E8%A1%97%E6%8B%8D'
    # base_url already carries the keyword query string, so the remaining
    # params must be joined with '&'; plain concatenation glued the first
    # param onto the keyword value (...%8Daid=24&offset=...), breaking it.
    url = base_url + '&' + urlencode(params)
    try:
        resp = requests.get(url)
        print(url)
        if 200 == resp.status_code:
            print(resp.json())
            return resp.json()
    except requests.ConnectionError:
        pass
    return None
def get_images(json):
    """Yield {'image': full-res url, 'title': ...} for every picture on a result page."""
    entries = json.get('data')
    if not entries:
        return
    for entry in entries:
        # entries carrying a cell_type are non-article cells with no images
        if entry.get('cell_type') is not None:
            continue
        caption = entry.get('title')
        for picture in entry.get('image_list'):
            # swap the thumbnail path segment for the full-resolution one
            full_url = re.sub("list", "origin", picture.get('url'))
            yield {
                'image': full_url,
                'title': caption
            }
    print('succ')
def save_image(item):
    """Download item['image'] and store it under img/<title>/<md5-of-content>.jpg."""
    folder = 'img' + os.path.sep + item.get('title')
    print('succ2')
    if not os.path.exists(folder):
        os.makedirs(folder)
    try:
        resp = requests.get(item.get('image'))  # fetch the picture bytes by URL
        if codes.ok == resp.status_code:
            # content-hash file name deduplicates identical images
            digest = md5(resp.content).hexdigest()
            file_path = folder + os.path.sep + '{file_name}.{file_suffix}'.format(
                file_name=digest,
                file_suffix='jpg')
            if os.path.exists(file_path):
                print('Already Downloaded', file_path)
            else:
                print('succ3')
                with open(file_path, 'wb') as fh:
                    fh.write(resp.content)
                print('Downloaded image path is %s' % file_path)
                print('succ4')
    except requests.ConnectionError:
        print('Failed to Save Image,item %s' % item)
def main(offset):
    """Crawl one search-result page at the given offset and save every image found."""
    # local renamed from `json` to avoid shadowing the json module
    page_json = get_page(offset)
    for entry in get_images(page_json):
        print(entry)
        save_image(entry)
# Result pages step by 20 items: offsets 0, 20, ..., 140
GROUP_START = 0
GROUP_END = 7
if __name__ == '__main__':
    pool = Pool()  # multiprocessing.pool.Pool -- worker *processes*, not threads
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    pool.map(main, groups)  # fan each offset out to a worker process
    pool.close()
    pool.join()
result:
动态渲染页面的爬取
splash的使用
安装:
sudo apt install docker.io  # the Docker engine package on Ubuntu/Debian is docker.io, not docker
sudo vim /etc/docker/daemon.json
{
"registry-mirrors": [
"http://hub-mirror.c.163.com",
"https://registry.docker-cn.com/"
]
}
sudo docker pull scrapinghub/splash # 安装
docker run -p 8050:8050 scrapinghub/splash # 运行
Lua脚本的使用
print("Hello World!")
-- test.lua file script
a = 5 -- global variable
local b = 5 -- local variable
function joke()
    c = 5 -- global variable (no `local` keyword)
    local d = 6 -- local variable, gone once joke() returns
end
joke()
print(c,d) --> 5 nil  (d is out of scope here)
do
local a = 6 -- 局部变量
b = 6 -- 对局部变量重新赋值
print(a,b)