1、requests 库
# Fetch a single chapter page and print its raw HTML.
import requests

chapter_url = 'https://www.booktxt.com/20_20244/714050.html'
response = requests.get(url=chapter_url)
# The site serves GBK-encoded pages; set the encoding before reading .text.
response.encoding = 'GBK'
page_html = response.text
print(page_html)
2、BeautifulSoup 库
https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
具体章节下载
# Download one chapter and extract the body text from the div#content element.
from bs4 import BeautifulSoup
import requests

chapter_url = 'https://www.booktxt.com/20_20244/714050.html'
response = requests.get(url=chapter_url)
response.encoding = 'GBK'  # site pages are GBK-encoded
soup = BeautifulSoup(response.text, "lxml")
content_divs = soup.find_all('div', id='content')
print(content_divs)
# Replace the page's spacer characters with line breaks for readable output.
print(content_divs[0].text.replace(' ', '\r\n'))
目录下载
def run():
    """Fetch the book's table of contents and print each chapter title with its full URL."""
    index_url = 'https://www.booktxt.com/20_20244/'
    response = requests.get(url=index_url)
    response.encoding = 'GBK'  # site pages are GBK-encoded
    soup = BeautifulSoup(response.text, "lxml")
    # The chapter list lives inside div#list; re-parse just that fragment
    # so find_all('a') only sees chapter links.
    list_div = soup.find_all('div', id='list')
    chapter_links = BeautifulSoup(str(list_div[0]), "lxml").find_all('a')
    for link in chapter_links:
        # href values are relative, so prepend the index URL.
        print(link.string, index_url + link.get('href'))
合并下载
def run():
    """Crawl the table of contents and append every chapter into one output file."""
    index_url = 'https://www.booktxt.com/20_20244/'
    response = requests.get(url=index_url)
    response.encoding = 'GBK'  # site pages are GBK-encoded
    soup = BeautifulSoup(response.text, "lxml")
    # Restrict link extraction to the chapter list container div#list.
    list_div = soup.find_all('div', id='list')
    chapter_links = BeautifulSoup(str(list_div[0]), "lxml").find_all('a')
    for link in chapter_links:
        chapter_url = index_url + link.get('href')
        print(link.string, chapter_url)
        # writer() opens in append mode, so chapters accumulate in order.
        writer(link.string, '我不想当老大.txt', get_contents(chapter_url))
def get_contents(target):
    """Download one chapter page and return its body text.

    Args:
        target: Absolute URL of the chapter page.

    Returns:
        The chapter text from div#content, with the page's spacer
        characters replaced by line breaks.
    """
    response = requests.get(url=target)
    response.encoding = 'GBK'  # site pages are GBK-encoded
    soup = BeautifulSoup(response.text, "lxml")
    content_divs = soup.find_all('div', id='content')
    return content_divs[0].text.replace(' ', '\r\n')
def writer(name, path, text):
    """Append one chapter to the output file.

    Args:
        name: Chapter title, written on its own line before the body.
        path: Output file path; opened in append mode so repeated calls
            accumulate chapters in order.
        text: Chapter body as a single string.
    """
    with open(path, 'a', encoding='utf-8') as f:
        f.write(name + '\n')
        # Use write(), not writelines(): text is one string, and
        # writelines() would iterate it character by character.
        f.write(text)
        f.write('\n\n')
3、爬取图片
# Scrape FY4A infrared satellite images listed on nmc.cn and save them locally.
import logging
import os
from pathlib import Path
from bs4 import BeautifulSoup
import requests

page_url = 'http://www.nmc.cn/publish/satellite/FY4A-infrared.htm'
response = requests.get(url=page_url)
response.encoding = 'UTF-8'
soup = BeautifulSoup(response.text, "lxml")
# The image list lives in div#timeWrap; each entry is a div.col-xs-12
# whose data-img attribute holds the image URL.
time_divs = soup.find_all('div', id='timeWrap')
entries = BeautifulSoup(str(time_divs[0]), "lxml").find_all('div', class_='col-xs-12')
for each in entries:
    img_http = each.get('data-img')
    # Derive a relative file name: strip the host prefix and the query string.
    name = img_http.replace('http://image.nmc.cn/product/', '').split('?')[0]
    image = requests.get(img_http)
    target_file = Path(f'D:/test/cloud/{name}')
    # The name may contain sub-directories; create them as needed.
    target_file.parent.mkdir(exist_ok=True, parents=True)
    with target_file.open('wb') as f:
        f.write(image.content)
4、动态爬取Chromeless
可替代以前的 PhantomJS、NightmareJS 或 Selenium:它们能做的事,它几乎都能做
Python的ChromeDriver
ChromeDriver下载
下载地址:
- http://npm.taobao.org/mirrors/chromedriver/(可用)
- chromedriver的版本要与你使用的chrome版本对应
将chromedriver.exe放置在anaconda安装路径下的Scripts目录下,例如:E:\ProgramData\Miniconda3\envs\py37\Scripts
安装selenium库
conda install selenium
测试selenium库是否安装成功
#用 Chrome 浏览器来测试
# Smoke test: verify selenium + chromedriver are installed correctly.
from selenium import webdriver
# Launches a real Chrome window via chromedriver (must be on PATH).
browser = webdriver.Chrome()
browser.get('http://www.baidu.com/')
运行这段代码,会自动打开浏览器,然后访问百度。
5、爬取html中的图片(云图)
# Scrape FY4A true-color cloud images by regex-matching data-img URLs in the page HTML.
import os  # fix: the original used os.getcwd/os.path/os.mkdir without importing os
import re

import requests

_session = requests.Session()
url = 'http://www.nmc.cn/publish/satellite/FY4A-true-color.htm'
res = _session.get(url=url).text
# Each image URL appears in the markup as: data-img="..." data-time=
htp_list = re.findall(r'data-img="(.*?)" data-time=', res)
os_path = os.getcwd() + '/cloud/'
if not os.path.exists(os_path):
    os.mkdir(os_path)
for htp in htp_list:
    # File name is the path segment after 'medium/' and before the query string.
    cloud_name = re.findall(r'medium/(.*?)\?', htp)[0]
    print(cloud_name)
    with open(os_path + cloud_name, 'wb') as f:
        cloud_byte = _session.get(htp).content
        f.write(cloud_byte)
6、爬取json中的图片(雷达)
pip install demjson
pip install requests-html
import re
from pathlib import Path
from requests_html import HTMLSession
import demjson
_session = HTMLSession()
def radar():
    """Download the current national radar mosaic frames from weather.com.cn.

    Fetches a JSONP index of radar frames, decodes it with demjson, and
    saves each frame as <timestamp>.png under ./zgtwq_radar/ebref_achn/.
    """
    # Example target image:
    # https://pi.weather.com.cn/i/product/pic/m/sevp_aoc_rdcp_sldas_ebref_achn_l88_pi_20201223063600001.png
    url = 'http://d1.weather.com.cn/radar/JC_RADAR_CHN_JB.html'
    text_res = _session.get(url=url).text
    # The endpoint returns JSONP; strip the callback wrapper to get the JSON body.
    json_str_res = re.findall(r'\((.*?)\)', text_res)[0]
    json_res = demjson.decode(json_str_res)
    radar_list = json_res['radars']
    # fix: the original called os.getcwd() without importing os (NameError).
    # Use the already-imported pathlib instead; same directory is produced.
    out_dir = Path.cwd() / 'zgtwq_radar' / 'ebref_achn'
    out_dir.mkdir(parents=True, exist_ok=True)
    for r in radar_list:
        htp = f"https://pi.weather.com.cn/i/product/pic/m/sevp_aoc_rdcp_sldas_{r['fn']}_l88_pi_{r['ft']}.png"
        with open(out_dir / (r['ft'] + '.png'), 'wb') as f:
            f.write(_session.get(htp).content)
参考:https://blog.csdn.net/u010591976/article/details/104166095