Python 3 Web Scraping

1. The requests library

import requests

# Fetch a single chapter page; the site serves GBK-encoded HTML
target = 'https://www.booktxt.com/20_20244/714050.html'
req = requests.get(url=target)
req.encoding = 'GBK'  # tell requests how to decode the response body
html = req.text
print(html)
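Many sites reject the default python-requests User-Agent or stall indefinitely, so in practice it helps to send a browser-like header, set a timeout, and check the status code. A minimal sketch of the same request with those guards (the header string is only an example):

import requests

target = 'https://www.booktxt.com/20_20244/714050.html'
# A browser-like User-Agent; many sites block the default python-requests one
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
req = requests.get(url=target, headers=headers, timeout=10)
req.raise_for_status()                # fail fast on 4xx/5xx responses
req.encoding = req.apparent_encoding  # let requests guess the real encoding
print(req.text[:200])                 # preview the first 200 characters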

2. The BeautifulSoup library

https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
Downloading a specific chapter

from bs4 import BeautifulSoup
import requests

# Download one chapter and extract its text
target = 'https://www.booktxt.com/20_20244/714050.html'
req = requests.get(url=target)
req.encoding = 'GBK'
html = req.text
bb = BeautifulSoup(html, "lxml")
texts = bb.find_all('div', id='content')  # the chapter body lives in div#content
print(texts)
print(texts[0].text.replace('    ', '\r\n'))  # turn indent runs into line breaks
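BeautifulSoup also accepts CSS selectors, which can be terser than find_all for id lookups. A small equivalent sketch, reusing the bb soup object from the snippet above:

# Equivalent lookup with a CSS id selector
content = bb.select_one('div#content')
if content is not None:
    print(content.text.replace('    ', '\r\n'))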

Downloading the table of contents

from bs4 import BeautifulSoup
import requests


def run():
    target = 'https://www.booktxt.com/20_20244/'
    req = requests.get(url=target)
    req.encoding = 'GBK'
    html = req.text
    bb = BeautifulSoup(html, "lxml")
    div = bb.find_all('div', id='list')        # the TOC container
    a_bf = BeautifulSoup(str(div[0]), "lxml")  # re-parse just that div
    a = a_bf.find_all('a')                     # one <a> per chapter
    for each in a:
        print(each.string, target + each.get('href'))
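Note that target + each.get('href') only builds a correct URL while every href is relative to the book's directory. A more robust sketch of the same loop uses urllib.parse.urljoin, which also copes with absolute hrefs:

from urllib.parse import urljoin

for each in a:
    # Resolve the href against the base URL; absolute hrefs pass through unchanged
    chapter_url = urljoin(target, each.get('href'))
    print(each.string, chapter_url)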

Combining the two: downloading the whole book

from bs4 import BeautifulSoup
import requests


def run():
    target = 'https://www.booktxt.com/20_20244/'
    req = requests.get(url=target)
    req.encoding = 'GBK'
    html = req.text
    bb = BeautifulSoup(html, "lxml")
    div = bb.find_all('div', id='list')
    a_bf = BeautifulSoup(str(div[0]), "lxml")
    a = a_bf.find_all('a')
    for each in a:
        print(each.string, target + each.get('href'))
        # Fetch each chapter and append it to the output file
        writer(each.string, '我不想当老大.txt', get_contents(target + each.get('href')))


def get_contents(target):
    """Download one chapter page and return its text."""
    req = requests.get(url=target)
    req.encoding = 'GBK'
    html = req.text
    bb = BeautifulSoup(html, "lxml")
    texts = bb.find_all('div', id='content')
    return texts[0].text.replace('    ', '\r\n')


def writer(name, path, text):
    """Append a chapter title and its text to the file at path."""
    with open(path, 'a', encoding='utf-8') as f:
        f.write(name + '\n')
        f.writelines(text)
        f.write('\n\n')
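Downloading every chapter in a tight loop can hammer the server; a polite crawler sleeps between requests. A minimal variation of the loop in run() (the one-second delay is an arbitrary choice):

import time

for each in a:
    url = target + each.get('href')
    writer(each.string, '我不想当老大.txt', get_contents(url))
    time.sleep(1)  # throttle: wait one second between chapter requests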


3. Scraping images

from pathlib import Path

from bs4 import BeautifulSoup
import requests

# Scrape the FY4A infrared satellite images from the NMC page
target = 'http://www.nmc.cn/publish/satellite/FY4A-infrared.htm'
req = requests.get(url=target)
req.encoding = 'UTF-8'
html = req.text
bb = BeautifulSoup(html, "lxml")
div = bb.find_all('div', id='timeWrap')
a_bf = BeautifulSoup(str(div[0]), "lxml")
a = a_bf.find_all('div', class_='col-xs-12')
for each in a:
    img_http = each.get('data-img')  # the image URL is stored in a data attribute
    name = img_http.replace('http://image.nmc.cn/product/', '').split('?')[0]
    r = requests.get(img_http)
    # Create the target directory (and parents) if it does not exist yet
    Path(f'D:/test/cloud/{name}').parent.mkdir(exist_ok=True, parents=True)
    with open(f'D:/test/cloud/{name}', 'wb') as f:
        f.write(r.content)
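r.content buffers the whole image in memory before writing; for bigger files requests can stream the body in chunks instead. A sketch of the download step, assuming the img_http and name variables from the loop above:

# Stream the image to disk in 8 KB chunks instead of buffering it fully
with requests.get(img_http, stream=True) as r:
    r.raise_for_status()
    with open(f'D:/test/cloud/{name}', 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)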

4. Dynamic scraping (Chromeless / headless Chrome)

Headless Chrome automation replaces the older PhantomJS and NightmareJS and can do almost everything they could; in Python, the usual setup is Selenium driving Chrome through ChromeDriver.
ChromeDriver for Python

Download ChromeDriver:

  1. http://npm.taobao.org/mirrors/chromedriver/ (working mirror)
  2. The ChromeDriver version must match the Chrome version you have installed.
     Place chromedriver.exe in the Scripts directory of your Anaconda installation, e.g. E:\ProgramData\Miniconda3\envs\py37\Scripts. (Alternatively, let a helper package manage the driver; see the sketch after this list.)
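If matching ChromeDriver to the local Chrome by hand becomes tedious, the third-party webdriver-manager package (pip install webdriver-manager) can fetch a matching driver at runtime. A sketch assuming Selenium 4 and that selenium is installed as in the next step:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# webdriver-manager downloads a ChromeDriver matching the local Chrome
service = Service(ChromeDriverManager().install())
browser = webdriver.Chrome(service=service)
browser.get('http://www.baidu.com/')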

Install the selenium library

conda install selenium

Test that selenium is installed correctly

# Test with the Chrome browser

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('http://www.baidu.com/')


Running this code automatically opens a browser window and navigates to Baidu.
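For unattended scraping you usually want no visible window; Chrome can run headless, and the rendered DOM can then be handed to BeautifulSoup. A minimal sketch (baidu.com is just the example target from above):

from selenium import webdriver
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run Chrome without opening a window

browser = webdriver.Chrome(options=options)
browser.get('http://www.baidu.com/')
soup = BeautifulSoup(browser.page_source, 'lxml')  # parse the rendered DOM
print(soup.title.string)
browser.quit()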

5. Scraping images embedded in HTML (cloud imagery)

import os
import re

import requests

_session = requests.Session()  # reuse one TCP session for all requests

# Pull the FY4A true-color image URLs out of the page with a regex
url = 'http://www.nmc.cn/publish/satellite/FY4A-true-color.htm'
res = _session.get(url=url).text
htp_list = re.findall(r'data-img="(.*?)" data-time=', res)
os_path = os.getcwd() + '/cloud/'
if not os.path.exists(os_path):
    os.mkdir(os_path)
for htp in htp_list:
    cloud_name = re.findall(r'medium/(.*?)\?', htp)[0]  # file name from the URL
    print(cloud_name)
    with open(os_path + cloud_name, 'wb') as f:
        cloud_byte = _session.get(htp).content
        f.write(cloud_byte)
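Re-running this script downloads every image again; checking for existing files makes the crawl resumable. A small variation of the loop above:

for htp in htp_list:
    cloud_name = re.findall(r'medium/(.*?)\?', htp)[0]
    file_path = os_path + cloud_name
    if os.path.exists(file_path):  # already downloaded on a previous run
        continue
    with open(file_path, 'wb') as f:
        f.write(_session.get(htp).content)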

6. Scraping images listed in JSON (radar)

pip install demjson
pip install requests-html

import os
import re
from pathlib import Path

from requests_html import HTMLSession
import demjson

_session = HTMLSession()


def radar():
    # Example image URL:
    # https://pi.weather.com.cn/i/product/pic/m/sevp_aoc_rdcp_sldas_ebref_achn_l88_pi_20201223063600001.png
    url = 'http://d1.weather.com.cn/radar/JC_RADAR_CHN_JB.html'
    text_res = _session.get(url=url).text
    # The response is JSONP: strip the callback wrapper to get the JSON body
    json_str_res = re.findall(r'\((.*?)\)', text_res)[0]
    json_res = demjson.decode(json_str_res)  # demjson tolerates relaxed JSON
    radar_list = json_res['radars']
    os_path = os.getcwd() + '/zgtwq_radar/ebref_achn/'
    if not Path(os_path).exists():
        Path(os_path).mkdir(parents=True)
    for r in radar_list:
        htp = f"https://pi.weather.com.cn/i/product/pic/m/sevp_aoc_rdcp_sldas_{r['fn']}_l88_pi_{r['ft']}.png"
        with open(os_path + r['ft'] + '.png', 'wb') as f:
            cloud_byte = _session.get(htp).content
            f.write(cloud_byte)
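demjson is used here on the assumption that the JSONP payload may not be strict JSON (single quotes, unquoted keys); if the feed does turn out to be strict, the standard-library json module suffices. A hedged sketch of the decode step:

import json

try:
    # Works when the payload inside the callback is strict JSON
    json_res = json.loads(json_str_res)
except json.JSONDecodeError:
    # Fall back to demjson, which tolerates relaxed syntax
    json_res = demjson.decode(json_str_res)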

Reference: https://blog.csdn.net/u010591976/article/details/104166095
