参考文档说明
'''
import requests
from bs4 import BeautifulSoup
#请求网页豆瓣爬取成都豆瓣xxx
url = "https://music.douban.com/"
# 伪装成浏览器的header
fake_headers = {
'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'
}
response = requests.get(url, headers=fake_headers)
with open("SAA.html",'w',encoding='utf-8') as f:
f.write(response.content.decode('utf-8'))
f.close()
# 保存网页到本地# 保存网页到本地# 保存网页到本地# 保存网页到本地
# 保存网页到本地# 保存网页到本地# 保存网页到本地# 保存网页到本地
# 保存网页到本地# 保存网页到本地# 保存网页到本地# 保存网页到本地
# 保存网页到本地# 保存网页到本地# 保存网页到本地# 保存网页到本地
PC端:
safari 5.1 – MAC
User-Agent:Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50
safari 5.1 – Windows
User-Agent:Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50
IE 9.0
User-Agent:Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;
IE 8.0
User-Agent:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)
IE 7.0
User-Agent:Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)
IE 6.0
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
Firefox 4.0.1 – MAC
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
Firefox 4.0.1 – Windows
User-Agent:Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
Opera 11.11 – MAC
User-Agent:Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11
Opera 11.11 – Windows
User-Agent:Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11
Chrome 17.0 – MAC
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
傲游(Maxthon)
User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)
腾讯TT
User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)
世界之窗(The World) 2.x
User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)
世界之窗(The World) 3.x
User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)
搜狗浏览器 1.x
User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)
360浏览器
User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)
Avant
User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)
Green Browser
User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)
作者:大牧莫邪,源于简书
链接:https://www.jianshu.com/p/da6a44d0791e
Opera
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60
Opera/8.0 (Windows NT 5.1; U; en)
Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50
Firefox
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0
Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
Safari
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2
chrome
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16
360
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
淘宝浏览器
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11
猎豹浏览器
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)"
QQ浏览器
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
sogou浏览器
Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)
maxthon浏览器
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36
UC浏览器
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36
CSDN: https://blog.csdn.net/Jamesaonier/article/details/89003053
# file_obj = open('SAA.html', 'w')
# file_obj.write(response.content.decode('utf-8'))
# file_obj.close()
# 解析网页
# 初始化BeautifulSoup方法:利用网页字符串自带的编码信息解析网页
# soup = BeautifulSoup(response.content.decode('utf-8'), 'lxml')
# all_movies = soup.find('div', id="showing-soon")
# for each_movie in all_movies.find_all('div', class_="item"):
# all_a_tag = each_movie.find_all('a')
# all_li_tag = each_movie.find_all('li')
# movie_name = all_a_tag[1].text
# moive_href = all_a_tag[1]['href']
# movie_date = all_li_tag[0].text
# movie_type = all_li_tag[1].text
# movie_area = all_li_tag[2].text
# movie_lovers = all_li_tag[3].text
# print('名字:{},链接:{},日期:{},类型:{},地区:{}, 关注者:{}'.format(
# movie_name, moive_href, movie_date, movie_type, movie_area, movie_lovers))
'''
002案例:
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import urllib
import urllib.request  # bug fix: urllib.request is a submodule; "import urllib" alone does not guarantee it
import requests
from lxml import html
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Scrape photo galleries from www.mmonly.cc.
# Each list page links to galleries; we download the first image of every
# gallery, then walk the gallery's remaining pages one by one.

# Silence the InsecureRequestWarning triggered by verify=False requests.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

SAVE_DIR = 'ccccxvxvvx'
# bug fix: os.mkdir raised FileExistsError on every run after the first;
# makedirs(exist_ok=True) makes the script re-runnable.
os.makedirs(SAVE_DIR, exist_ok=True)

for page in range(1, 852):
    url = 'http://www.mmonly.cc/mmtp/list_9_%s.html' % page
    print(url)
    response = requests.get(url, verify=False).text
    selector = html.fromstring(response)
    gallery_links = selector.xpath('//div[@class="ABox"]/a')
    print(len(gallery_links))
    for index, img in enumerate(gallery_links):
        imgUrl = img.xpath('@href')[0]
        response = requests.get(imgUrl, verify=False).text
        selector = html.fromstring(response)
        # Total number of pages in this gallery (second <span> in the title).
        pageEle = selector.xpath('//div[@class="wrapper clearfix imgtitle"]/h1/span/span[2]/text()')[0]
        print(pageEle)
        # Direct download link for the first image.
        first_img = selector.xpath('//a[@class="down-btn"]/@href')[0]
        imgName = '%s_%s_1.jpg' % (page, str(index + 1))
        coverPath = os.path.join(os.getcwd(), SAVE_DIR, imgName)
        urllib.request.urlretrieve(first_img, coverPath)
        # Remaining pages of the gallery: xxx.html -> xxx_2.html, xxx_3.html, ...
        for page_2 in range(2, int(pageEle) + 1):
            url = imgUrl.replace('.html', '_%s.html' % str(page_2))
            # bug fix: this request was the only one missing verify=False and
            # would die on certificate errors while all others tolerated them.
            response = requests.get(url, verify=False).text
            selector = html.fromstring(response)
            img_link = selector.xpath('//a[@class="down-btn"]/@href')[0]
            print(img_link)
            imgName = '%s_%s_%s.jpg' % (page, str(index + 1), page_2)
            coverPath = os.path.join(os.getcwd(), SAVE_DIR, imgName)
            urllib.request.urlretrieve(img_link, coverPath)
        # Be polite to the server between galleries.
        time.sleep(2)
运行结果如下:
http://www.mmonly.cc/mmtp/list_9_1.html
24
10
https://t1.huishahe.com/uploads/tu/202105/9999/22bb4d603c.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/94d1cbaacf.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/523f9c9322.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/aa9283235c.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/fd63f891a4.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/1f6df3a2ab.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/26bcdfc79f.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/1a00c5c43f.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/875f4cadfa.jpg
14
https://t1.huishahe.com/uploads/tu/202105/9999/40fdab5062.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/4925afc3d8.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/e8c6af7505.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/1c60c2fbe3.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/23e3f53f8b.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/63ac2131b9.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/d22ecfa5c5.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/916304e7ee.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/d6490d93df.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/72fd16b753.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/df0236cfc6.jpg
https://t1.huishahe.com/uploads/tu/202105/9999/71e53775dd.jpg
…省略
有bug的一个程序,没改出来(003案例)
# _*_coding:utf-8_*_
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import urllib
import requests
from lxml import html
import time
class GetImage(object):
    """Download every <img> referenced by a page of www.yangqq.com/skin/jxhx/.

    Images are saved into an ``abcdd`` directory created next to this script.
    """

    def __init__(self, url):
        # Page whose images will be harvested by run().
        self.url = url
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
        }
        self.dir_path = os.path.dirname(os.path.abspath(__file__))
        self.path = self.dir_path + '/abcdd'
        isExists = os.path.exists(self.dir_path + '/abcdd')
        # Create the target directory on first use.
        if not isExists:
            os.makedirs(self.path)

    def download(self, url):
        """GET *url* with browser headers; return the Response, or None on failure."""
        try:
            res = requests.get(url, headers=self.headers)
            return res
        except Exception as E:
            # bug fix: the original concatenated the exception object itself to
            # a str, which raised TypeError and masked the real error.
            print(url + '下载失败,原因:' + str(E))

    def parse(self, res):
        """Extract <img src> values from *res* and return them as absolute URLs."""
        content = res.content.decode()
        img_list = re.findall(r'<img.*?src="(.*?)"', content, re.S)
        # bug fix: the original prepended the site prefix unconditionally,
        # mangling src values that were already absolute URLs.
        img_list = [url if url.startswith('http')
                    else 'http://www.yangqq.com/skin/jxhx/' + url
                    for url in img_list]
        return img_list

    def save(self, res_img, file_name):
        """Write the image response body to *file_name*; skip failed downloads."""
        if res_img:
            with open(file_name, 'wb') as f:
                f.write(res_img.content)
            # bug fix: the original printed the module-global ``url`` (NameError
            # when the class is used standalone); report the saved file instead.
            print(file_name + '下载成功')

    def run(self):
        """Fetch self.url, parse its image links, and download each image."""
        # Download the page itself.
        res = self.download(self.url)
        # Parse the image URLs out of it.
        url_list = self.parse(res)
        # Fetch and save every image, named after the last URL path segment.
        for url in url_list:
            res_img = self.download(url)
            name = url.strip().split('/').pop()
            file_name = self.path + '/' + name
            self.save(res_img, file_name)
if __name__ == '__main__':
    # Pages of the jxhx demo template whose images we want to mirror.
    # NOTE: the loop variable must stay named ``url`` — GetImage.save() reads
    # the module-global ``url`` when reporting a successful download.
    url_list = [
        'https://www.yangqq.com/skin/jxhx/',
        'https://www.yangqq.com/skin/jxhx/list.html',
        'https://www.yangqq.com/skin/jxhx/share.html',
        'https://www.yangqq.com/skin/jxhx/list2.html',
        'https://www.yangqq.com/skin/jxhx/list3.html',
        'https://www.yangqq.com/skin/jxhx/daohang.html',
        'https://www.yangqq.com/skin/jxhx/about.html',
    ]
    for url in url_list:
        GetImage(url).run()
好用的004
import requests
from bs4 import BeautifulSoup

# Scrape the "coming soon" movie list for Chengdu from douban.
url = "https://movie.douban.com/cinema/later/chengdu/"
# Pretend to be a regular desktop browser so douban serves the full page.
fake_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
}
response = requests.get(url, headers=fake_headers)
# Decode once; the original called response.content.decode('utf-8') twice.
page_text = response.content.decode('utf-8')

# Save a copy of the page locally.
# bug fix: the original used open() without an encoding (platform default,
# mojibake on GBK Windows) and left the handle open if write() raised.
with open('douBB.html', 'w', encoding='utf-8') as file_obj:
    file_obj.write(page_text)

# Parse the page: the "showing soon" section holds one div.item per movie.
soup = BeautifulSoup(page_text, 'lxml')
all_movies = soup.find('div', id="showing-soon")
for each_movie in all_movies.find_all('div', class_="item"):
    all_a_tag = each_movie.find_all('a')
    all_li_tag = each_movie.find_all('li')
    movie_name = all_a_tag[1].text
    movie_href = all_a_tag[1]['href']  # typo fix: was "moive_href"
    movie_date = all_li_tag[0].text
    movie_type = all_li_tag[1].text
    movie_area = all_li_tag[2].text
    movie_lovers = all_li_tag[3].text
    print('名字:{},链接:{},日期:{},类型:{},地区:{}, 关注者:{}'.format(
        movie_name, movie_href, movie_date, movie_type, movie_area, movie_lovers))