基本库的使用urllib
urllib包含了四个模块:
(1)request,http请求模块
(2)error,异常处理模块
(3)parse,url处理的工具模块
(4)robotparser,识别网站的robots.txt文件
1.request
from urllib import request, parse, error
import socket
'''
request.urlopen(url,data,timeout,cafile,capath,cadefault,context) url是必须的参数
'''
# Demo: POST a urlencoded form via urllib; urlopen requires the body as bytes.
data = bytes(parse.urlencode({
    'user': '1300330101'},), encoding='utf-8')
try:
    # 1-second timeout so a dead host fails fast instead of hanging
    res = request.urlopen('http://148.70.139.25/login_sure/', data=data, timeout=1)
    '''
    res:read() readinto() getheaders() getheader(name) fileno()
    msg version status reason close
    '''
    # print(res.read().decode('utf-8'))
    print(type(res))          # http.client.HTTPResponse
    print(res.getheaders())   # all response headers as (name, value) pairs
    print(res.getheader('Server'))
except error.URLError as e:
    # a socket.timeout reason means the request simply timed out
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
'''
复杂的构造请求
req=request.Request(url,data=None,headers={},origin_req_host=None,unverifiable=False,method=None)
response=request.urlopen(req)
'''
2.requests
学了这个基本不用request了
import requests
from requests.auth import HTTPBasicAuth
'''
打开的方式
r=requests.get()
requests.post()
requests.head()
requests.put()
requests.delete()
requests.options()
r.text r.cookies r.status_code r.json r.content
r.headers r.encoding r.reason r.close() r.history
'''
# Typical browser-like request headers; Cookie left empty on purpose.
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/48.0.2564.116 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
    'Cookie': ''
}
r = requests.get('https://www.baidu.com', headers=header, verify=False)  # skip SSL certificate verification
r.encoding = 'utf-8'
'''
基本用法见https://blog.csdn.net/qq_33564134/article/details/88818172
'''
# File upload -- use a context manager so the handle is closed after the POST
# (the original left the file object open forever).
with open('names', 'rb') as upload_fh:
    files = {
        'file': upload_fh}
    r = requests.post('http://httpbin.org/post', files=files)  # was 'http://httpbin/post' (unresolvable host)
# Proxy settings
proxies = {
    'http': 'http://10.10.10.1:2123',
    'https': 'https://1.1.1.1:2090',  # fixed: was misspelled scheme 'htttps://'
}
r = requests.post('url', proxies=proxies, timeout=1)
# HTTP basic authentication
r = requests.post('url', auth=HTTPBasicAuth('username', 'password'))
import requests
import re
import json
import time
def get_one_page(url):
    """Download one Maoyan board page; return its HTML text, or None on a non-200 reply."""
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/48.0.2564.116 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4'
    }
    page = requests.get(url, headers=browser_headers)
    # force UTF-8 decoding regardless of the declared charset
    page.encoding = 'utf-8'
    if page.status_code != 200:
        return None
    return page.text
def parse_one_page(html):
    """Yield one dict per movie (index/name/star/releasetime) scraped from board HTML."""
    movie_re = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?'                      # ranking index
        r'<p class="name">.*?<a href.*?data-val.*?>(.*?)</a>.*?'   # movie name
        r'<p class="star".*?>(.*?)</p>.*?'                         # starring cast
        r'<p class="releasetime".*?>(.*?)</p>.*?'                  # release time
        r'.*?</dd>', re.S)
    matches = re.findall(movie_re, html)
    print(matches)
    fields = ('index', 'name', 'star', 'releasetime')
    for match in matches:
        # stream results as a generator so callers need no intermediate list
        yield {key: value.strip() for key, value in zip(fields, match)}
def write_to_file(content):
    """Append one record to result.txt as a JSON line (UTF-8, non-ASCII kept readable)."""
    line = json.dumps(content, ensure_ascii=False)
    with open('result.txt', 'a', encoding='utf-8') as out:
        out.write(line + '\n')
def main(offset):
    """Fetch one board page at the given offset and persist every parsed movie."""
    url = 'https://maoyan.com/board/4?offset={}'.format(offset)
    print(url)
    page_html = get_one_page(url)
    for movie in parse_one_page(page_html):
        print(movie)
        write_to_file(movie)
if __name__ == '__main__':
    # 10 pages of 10 movies each (offsets 0, 10, ..., 90);
    # pause between requests to be polite to the server
    for offset in range(0, 100, 10):
        main(offset)
        time.sleep(2)
解析库的使用
Beautiful Soup
如果说寻找特定的内容
from bs4 import BeautifulSoup
import lxml
import re
# Soup object kinds: tag, tag.name, tag attributes, contained text
# tag = soup.b   tag.name   tag['class'] | tag.attrs   tag.string
# Traversal:
# contents -- list every child node
# len(soup.contents) -- number of child nodes
# find_all(tag, ...)
#
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...
</body>
"""
# Build the soup with the lxml parser
soup = BeautifulSoup(html, 'lxml')
# print(soup.prettify()) # auto-repairs the (deliberately broken) markup and pretty-prints it
# print(soup.body.contents) # everything contained in <body>
# print(soup.head.contents[0].string) # find the <title> tag and read its text
# print(soup.find_all('a')) # every <a> tag
# iterate tags whose name starts with "b" (body, b)
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
AJAX数据的爬取
AJAX 不是新的编程语言,而是一种使用现有标准的新方法。
AJAX 最大的优点是在不重新加载整个页面的情况下,可以与服务器交换数据并更新部分网页内容。
<script>
// Classic XMLHttpRequest demo: fetch /try/ajax/demo_get.php asynchronously
// and inject the response into #myDiv without reloading the page.
function loadXMLDoc()
{
    var xmlhttp;
    if (window.XMLHttpRequest)
    {
        // code for IE7+, Firefox, Chrome, Opera, Safari
        xmlhttp=new XMLHttpRequest();
    }
    else
    {
        // code for IE6, IE5 (no native XMLHttpRequest)
        xmlhttp=new ActiveXObject("Microsoft.XMLHTTP");
    }
    xmlhttp.onreadystatechange=function()
    {
        // readyState 4 = request finished, status 200 = OK
        if (xmlhttp.readyState==4 && xmlhttp.status==200)
        {
            document.getElementById("myDiv").innerHTML=xmlhttp.responseText;
        }
    }
    xmlhttp.open("GET","/try/ajax/demo_get.php",true);
    xmlhttp.send();
}
</script>
import requests
from urllib.parse import urlencode
# Weibo mobile API endpoint; per-page query params are appended in get_page().
base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/p/2304131618051664_-_WEIBO_SECOND_PROFILE_WEIBO',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    # mark the request as AJAX so the server answers with JSON
    'X-Requested-With': 'XMLHttpRequest',
}
# number of listing pages to crawl
max_page = 10
def get_page(page):
    """Fetch one page of the weibo container API.

    Returns (json_dict, page); json_dict is None when the request fails or
    the server answers with a non-200 status, so ``parse_page(*result)``
    callers can always unpack the result safely.
    """
    params = {
        'page_type': '03',
        'containerid': '2304131618051664_-_WEIBO_SECOND_PROFILE_WEIBO',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json(), page
    except requests.ConnectionError as e:
        print('Error', e.args)
    # Previously this fell through returning bare None, which made the
    # `parse_page(*json)` call in __main__ raise TypeError on any failure.
    return None, page
def parse_page(json, page: int):
    """Yield simplified weibo dicts ({'id', 'title', 'content'}) from one API page.

    Card index 1 on page 1 is skipped -- NOTE(review): presumably a pinned /
    profile card; confirm against the live API.
    """
    print(json)
    if not json:
        return
    # Defensive lookups: a response missing 'data' or with null 'cards'
    # yields nothing instead of raising AttributeError/TypeError
    # (the original did json.get('data').get('cards') unguarded).
    items = json.get('data', {}).get('cards') or []
    for index, item in enumerate(items):
        if page == 1 and index == 1:
            continue
        mblog = item.get('mblog', {})
        page_info = mblog.get('page_info', {})
        yield {
            'id': mblog.get('id'),
            'title': page_info.get('content1'),
            'content': page_info.get('content2'),
        }
if __name__ == '__main__':
    for page in range(1, max_page + 1):
        # renamed from `json` -- the original shadowed the json module
        result = get_page(page)
        # get_page returns None (or a (None, page) pair) when the request
        # fails; skip such pages instead of crashing on unpacking
        if not result or result[0] is None:
            continue
        for weibo in parse_page(*result):
            print(weibo)
import requests
from urllib.parse import urlencode
from requests import codes
import os
from hashlib import md5
from multiprocessing.pool import Pool
import re
def get_page(offset):
    """Fetch one page of Toutiao search results (keyword 街拍) and return the JSON dict.

    Returns None on a connection error or a non-200 status.
    """
    params = {
        'aid': '24',
        'offset': offset,
        'format': 'json',
        # 'keyword': '街拍',  # already URL-encoded inside base_url
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis'
    }
    base_url = 'https://www.toutiao.com/api/search/content/?keyword=%E8%A1%97%E6%8B%8D'
    # base_url already carries the keyword query string, so the remaining
    # params must be joined with '&'; plain concatenation glued the first
    # param onto the keyword value (...%8Daid=24&offset=...), breaking it.
    url = base_url + '&' + urlencode(params)
    try:
        resp = requests.get(url)
        print(url)
        if 200 == resp.status_code:
            print(resp.json())
            return resp.json()
    except requests.ConnectionError:
        pass
    return None
def get_images(json):
    """Yield {'image': full-res url, 'title': ...} for every picture on a result page."""
    entries = json.get('data')
    if not entries:
        return
    for entry in entries:
        # entries carrying a cell_type are non-article cells with no images
        if entry.get('cell_type') is not None:
            continue
        caption = entry.get('title')
        for picture in entry.get('image_list'):
            # swap the thumbnail path segment for the full-resolution one
            full_url = re.sub("list", "origin", picture.get('url'))
            yield {
                'image': full_url,
                'title': caption
            }
    print('succ')
def save_image(item):
    """Download item['image'] and store it under img/<title>/<md5-of-content>.jpg."""
    folder = 'img' + os.path.sep + item.get('title')
    print('succ2')
    if not os.path.exists(folder):
        os.makedirs(folder)
    try:
        resp = requests.get(item.get('image'))  # fetch the picture bytes by URL
        if codes.ok == resp.status_code:
            # content-hash file name deduplicates identical images
            digest = md5(resp.content).hexdigest()
            file_path = folder + os.path.sep + '{file_name}.{file_suffix}'.format(
                file_name=digest,
                file_suffix='jpg')
            if os.path.exists(file_path):
                print('Already Downloaded', file_path)
            else:
                print('succ3')
                with open(file_path, 'wb') as fh:
                    fh.write(resp.content)
                print('Downloaded image path is %s' % file_path)
                print('succ4')
    except requests.ConnectionError:
        print('Failed to Save Image,item %s' % item)
def main(offset):
    """Crawl one search-result page at the given offset and save every image found."""
    # local renamed from `json` to avoid shadowing the json module
    page_json = get_page(offset)
    for entry in get_images(page_json):
        print(entry)
        save_image(entry)
# Result pages step by 20 items: offsets 0, 20, ..., 140
GROUP_START = 0
GROUP_END = 7
if __name__ == '__main__':
    pool = Pool()  # multiprocessing.pool.Pool -- worker *processes*, not threads
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    pool.map(main, groups)  # fan each offset out to a worker process
    pool.close()
    pool.join()
result:
动态渲染页面的爬取
splash的使用
安装:
sudo apt install docker.io  # the Docker engine package on Ubuntu/Debian is docker.io, not docker
sudo vim /etc/docker/daemon.json
{
"registry-mirrors": [
"http://hub-mirror.c.163.com",
"https://registry.docker-cn.com/"
]
}
sudo docker pull scrapinghub/splash # 安装
docker run -p 8050:8050 scrapinghub/splash # 运行
Lua脚本的使用
print("Hello World!")
-- test.lua file script
a = 5 -- global variable
local b = 5 -- local variable
function joke()
    c = 5 -- global variable (no `local` keyword)
    local d = 6 -- local variable, gone once joke() returns
end
joke()
print(c,d) --> 5 nil  (d is out of scope here)
do
local a = 6 -- 局部变量
b = 6 -- 对局部变量重新赋值
print(a,b)