Preface
All of the Python crawler code in this article is intended as preparation for penetration testing. Please do not use it to scrape illegal content. If anyone uses it for malicious, illegal crawling, that has nothing to do with me!!!
1. Scraping a site's homepage
# Scrape the Sogou homepage
import requests

if __name__ == "__main__":
    url = "https://www.sogou.com/"  # target URL
    response = requests.get(url)  # send the request and keep the response object
    pagedata = response.text  # page source to persist
    # print(pagedata)
    with open('./sougou.html', 'w', encoding='utf-8') as fp:
        fp.write(pagedata)
    print("Scraping finished")
2. A simple web collector
# Simple web collector
import requests

# UA spoofing: wrap the User-Agent in a dict
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
}
url = 'https://www.sogou.com/web'
# Put the URL parameters into a dict
kw = input("Enter a search keyword: ")
param = {
    'query': kw
}
response = requests.get(url=url, params=param, headers=headers)
pagedata = response.text
file_name = kw + '.html'
with open(file_name, 'w', encoding='utf-8') as fp:
    fp.write(pagedata)
print("Scraped and saved successfully!")
3. "Cracking" Baidu Translate
# "Crack" Baidu Translate
import requests
import json

if __name__ == "__main__":
    post_url = 'https://fanyi.baidu.com/sug'
    word = input("Enter a word: ")
    data_dic = {
        'kw': word
    }
    # UA spoofing
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
    }
    response = requests.post(url=post_url, data=data_dic, headers=headers)  # send the POST request
    # The response data comes back as JSON
    dic_obj = response.json()
    file_name = word + ".json"
    with open(file_name, 'w', encoding='utf-8') as fp:  # context manager so the file gets closed
        json.dump(dic_obj, fp=fp, ensure_ascii=False)
    print("Translation finished!")
4. Scraping movie details from a Douban chart category
# Scrape the movie details from a Douban chart category
import requests
import json

if __name__ == "__main__":
    url = 'https://movie.douban.com/j/chart/top_list'
    param = {
        'type': '24',
        'interval_id': '100:90',
        'action': '',
        'start': '40',  # offset of the first movie to return
        'limit': '20'   # number of movies per page
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
    }
    response = requests.get(url=url, params=param, headers=headers)
    list_moviedata = response.json()
    with open('./douban.json', 'w', encoding='utf-8') as fp:  # context manager so the file gets closed
        json.dump(list_moviedata, fp=fp, ensure_ascii=False)
    print("over!!!")
5. Scraping the locations of KFC restaurants nationwide
I added a small twist here: a for loop lets you control how many pages of data to scrape.
# Scrape KFC restaurant location info
import requests

if __name__ == "__main__":
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    city = input("Enter a city: ")
    kfc_datas = ''
    for page in range(1, 10):
        param = {
            'cname': '',
            'pid': '',
            'keyword': city,
            'pageIndex': page,
            'pageSize': '10'
        }
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
        }
        response = requests.post(url=url, params=param, headers=headers)
        kfc_data = response.text
        kfc_datas += kfc_data
    file_name = city + '.txt'
    print(type(kfc_datas))
    with open(file_name, 'w', encoding='utf-8') as fp:
        fp.write(kfc_datas)
    print("over!!!")
6. Regex: scraping all images of one category from an image site
# Scrape all the images of one category on a single page of an image site
import requests
import re
import os

if __name__ == "__main__":
    if not os.path.exists('./imgdir'):  # create a folder to hold all the images
        os.mkdir('./imgdir')
    url = 'https://www.tooopen.com/img/87_312.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
    }
    web_data = requests.get(url=url, headers=headers).text
    ex = '<a class="pic".*?<img src="(.*?)" alt.*?</a>'  # regex that captures the image URLs
    img_src_list = re.findall(ex, web_data, re.S)
    for imghtml in img_src_list:  # iterate over the scraped image URLs and fetch each one
        img_data = requests.get(imghtml).content
        img_name = imghtml.split('/')[-1]  # image file name
        imgpath = './imgdir/' + img_name  # image save path
        with open(imgpath, 'wb') as fp:
            fp.write(img_data)
        print(img_name, 'downloaded!!!')
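One defensive tweak worth knowing (my addition, reusing url and img_src_list from the block above): if a page ever emits relative src values, resolving them against the page URL with urljoin keeps the download loop working; absolute URLs pass through unchanged.
# Sketch (addition): resolve possibly-relative image URLs before downloading
from urllib.parse import urljoin

for imghtml in img_src_list:
    full_url = urljoin(url, imghtml)  # absolute URLs are returned as-is
    img_data = requests.get(full_url).content
    img_name = full_url.split('/')[-1]
    with open('./imgdir/' + img_name, 'wb') as fp:
        fp.write(img_data)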
7. bs4: scraping the chapter titles and content of Romance of the Three Kingdoms
This one was never finished, because the content on the original site is now a mess and everything scraped turns out to be something else, but the approach is the same; the point is to understand the method.
# Scrape the chapter titles and chapter content of Romance of the Three Kingdoms
import requests
from bs4 import BeautifulSoup

if __name__ == "__main__":
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
    }
    pagedata = requests.get(url=url, headers=headers)
    pagedata.encoding = 'utf-8'  # fix the encoding first, otherwise the scraped data is garbled
    pagehtml = pagedata.text
    soup = BeautifulSoup(pagehtml, 'lxml')  # instantiate a BeautifulSoup object
    li_list = soup.select('.book-mulu > ul > li')
    for li in li_list:
        title = li.a.string  # chapter title
        title_url = 'https://www.shicimingju.com' + li.a['href']  # URL of each chapter
        requests.get(url=title_url, headers=headers)  # the chapter pages are fetched but never parsed
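For completeness, a hedged sketch (my addition, reusing li_list, headers and BeautifulSoup from the block above) of how the chapter loop could be finished; the 'chapter_content' class name is an assumption about the chapter pages' markup and needs to be checked against the live site.
# Sketch (addition): fetch each chapter page and append title + text to one file
for li in li_list:
    title = li.a.string
    title_url = 'https://www.shicimingju.com' + li.a['href']
    chapter_resp = requests.get(url=title_url, headers=headers)
    chapter_resp.encoding = 'utf-8'
    chapter_soup = BeautifulSoup(chapter_resp.text, 'lxml')
    content_div = chapter_soup.find('div', class_='chapter_content')  # assumed class name
    content = content_div.get_text() if content_div else ''
    with open('./sanguo.txt', 'a', encoding='utf-8') as fp:
        fp.write(title + '\n' + content + '\n')
    print(title, 'saved')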
8. xpath: scraping images of one category from a site
The key here is using xpath attribute selection to get the image download URL and the image file name. As for garbled Chinese file names, you can either change the encoding right after receiving the response, or change it when building the name:
response = requests.get(url=url, headers=headers)
response.encoding = 'gbk'
# general fix for garbled Chinese text
img_name = img_name.encode('iso-8859-1').decode('gbk')
# xpath: scrape the HD images from an image site
import requests
from lxml import etree
import os

if __name__ == "__main__":
    url = 'https://pic.netbian.com/new/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
    }
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gbk'
    page_data = response.text
    tree = etree.HTML(page_data)
    li_list = tree.xpath('//div[@class="slist"]//li')
    if not os.path.exists('./pics'):  # create a folder to hold the images
        os.mkdir('./pics')
    for li in li_list:
        img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]  # image download URL
        img_name = li.xpath('./a/img/@alt')[0] + '.jpg'  # image file name
        # img_name = img_name.encode('iso-8859-1').decode('gbk')  # fix garbled Chinese names
        img_data = requests.get(url=img_src, headers=headers).content
        img_path = './pics/' + img_name
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name, 'downloaded!!!')
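To go beyond the first listing page, a hedged sketch (my addition): the 'index_N.html' pattern for subsequent pages is an assumption about this site's pagination and should be confirmed in the browser before relying on it.
# Sketch (addition): loop over several listing pages; the URL pattern below is assumed
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0'}
for page_no in range(1, 4):
    if page_no == 1:
        page_url = 'https://pic.netbian.com/new/'
    else:
        page_url = 'https://pic.netbian.com/new/index_%d.html' % page_no  # assumed pattern
    resp = requests.get(url=page_url, headers=headers)
    resp.encoding = 'gbk'
    tree = etree.HTML(resp.text)
    print(page_url, len(tree.xpath('//div[@class="slist"]//li')), 'items found')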
9. xpath: scraping the names of every city in the country
A note: the hot cities are also contained in the full city list; pulling them out separately is just extra xpath practice.
# xpath: scrape the names of all cities
import requests
from lxml import etree

if __name__ == "__main__":
    url = 'https://www.aqistudy.cn/historydata/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
    }
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    hot_city_list = tree.xpath('//div[@class="bottom"]/ul/li/a/text()')  # names of the hot cities
    city_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text()')  # names of all cities
    print(city_list)
    # You can also grab the hot cities and all cities with a single expression:
    # city_list_name = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')
    # print(city_list_name)
10. Scraping the free resume templates from 站长素材
# Scrape the free resume templates from 站长素材
import requests
import os
from lxml import etree

if __name__ == "__main__":
    url = 'https://sc.chinaz.com/jianli/free.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
    }
    if not os.path.exists('./resume'):
        os.mkdir('./resume')
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    resume_html = tree.xpath('//div[@id="container"]/div/a/@href')  # detail-page URLs of the resume templates
    for html in resume_html:
        rzhuanma = requests.get(url=html, headers=headers)
        rzhuanma.encoding = 'utf-8'
        resume_download = rzhuanma.text
        rtree = etree.HTML(resume_download)
        download = rtree.xpath('//div[@class="clearfix mt20 downlist"]/ul[@class="clearfix"]/li[1]/a/@href')  # download link
        download_url = "".join(download)  # list to string so it can be requested directly
        file = requests.get(url=download_url, headers=headers).content  # download the resume archive
        file_name = rtree.xpath('//div[@class="ppt_tit clearfix"]/h1/text()')
        file_namep = "".join(file_name) + '.rar'
        file_path = './resume/' + file_namep
        with open(file_path, 'wb') as fp:
            fp.write(file)
        print(file_namep, "downloaded!!!")
11. Captcha recognition with the Chaojiying platform
# Chaojiying captcha recognition
import requests
from lxml import etree
from hashlib import md5

class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: image ID of a misrecognized captcha
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()

if __name__ == "__main__":
    url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
    headers = {
        'Connection': 'Keep-Alive',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
    }
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    check = tree.xpath('//img[@id="imgCode"]/@src')
    check_url = "https://so.gushiwen.cn" + "".join(check)
    check_img_path = './b.jpg'
    check_img_data = requests.get(url=check_url, headers=headers).content
    with open(check_img_path, 'wb') as fp:
        fp.write(check_img_data)
    # Call the recognition platform to identify the captcha
    chaojiying = Chaojiying_Client('idontcare', '1234567890', '950674')  # User Center >> Software ID: generate one and put it here (replaces the sample 96001)
    im = open('b.jpg', 'rb').read()  # path of the local captcha image (replaces the sample a.jpg); on Windows you sometimes need //
    print(chaojiying.PostPic(im, 1004)['pic_str'])  # 1004 is the captcha type
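The block above only recognizes the captcha; here is a hedged sketch (my addition, reusing url, headers, etree and Chaojiying_Client from above) of the actual login step. Because the captcha is bound to a session cookie, everything should go through one requests.Session. The form field names (__VIEWSTATE, __VIEWSTATEGENERATOR, email, pwd, code, denglu) are assumptions about the gushiwen login form and must be verified in the browser's developer tools before use.
# Sketch (addition): session-based login using the recognized captcha
session = requests.Session()
login_page = session.get(url=url, headers=headers).text
login_tree = etree.HTML(login_page)
code_src = 'https://so.gushiwen.cn' + ''.join(login_tree.xpath('//img[@id="imgCode"]/@src'))
img_bytes = session.get(url=code_src, headers=headers).content  # same session as the login POST
code_text = Chaojiying_Client('idontcare', '1234567890', '950674').PostPic(img_bytes, 1004)['pic_str']
login_data = {  # assumed field names; inspect the real form before using
    '__VIEWSTATE': ''.join(login_tree.xpath('//input[@id="__VIEWSTATE"]/@value')),
    '__VIEWSTATEGENERATOR': ''.join(login_tree.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value')),
    'email': 'your_account',
    'pwd': 'your_password',
    'code': code_text,
    'denglu': '登录',
}
login_resp = session.post(url=url, headers=headers, data=login_data)
print(login_resp.status_code)  # then check whether the response actually looks logged in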
12. Scraping target URLs and automatically checking them for a WAF
Here comes the big one!!! Crawling combined with scripting.
# Batch-collect edu.cn sites through a search engine, check whether each one sits behind a WAF,
# and write the results to files to make later penetration testing easier
import requests
from lxml import etree
import subprocess
import os

if __name__ == "__main__":
    def run_command(command):
        os.chdir("C:/Users/17398/PycharmProjects/untitled1/.idea/wafw00f-master/wafw00f")  # make sure the command runs from the directory that contains wafw00f's main.py
        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, error = process.communicate()
        return output, error

    url = 'https://www.google.com/search?q=inurl:edu.cn&gbv=2&sxsrf=AB5stBjMKJX8iPy1HwsLjpYFSgahd3kJpA:1689824667555&ei=m624ZNG-IYnWkPIP8PCJyAw&start=0&sa=N&ved=2ahUKEwiR-Kq4r5yAAxUJK0QIHXB4Ask4ChDy0wN6BAgDEAQ&biw=767&bih=736&dpr=1.25'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    edu_html = tree.xpath('//div[@class="yuRUbf"]/a/@href')  # target URLs on the first results page
    page_sum = tree.xpath('//tr[@jsname="TeSSVd"]/td/a/@href')  # links to results pages 2-10
    all_list = []  # holds every target URL collected
    all_list.extend(edu_html)  # store the page-1 targets
    for page_html in page_sum:
        page_htmls = "https://www.google.com" + "".join(page_html)
        else_data = requests.get(url=page_htmls, headers=headers).text
        else_tree = etree.HTML(else_data)
        else_html = else_tree.xpath('//div[@class="yuRUbf"]/a/@href')  # target URLs on pages 2-10
        all_list.extend(else_html)  # add them to all_list
    print(all_list)
    for edu_htmls in all_list:
        commands = "python main.py " + edu_htmls  # run wafw00f against the target
        output, error = run_command(commands)
        true_str = "seems to be behind a WAF"  # decide based on wafw00f's output text
        outputs = output.decode('utf-8')
        errors = error.decode('utf-8')
        if outputs.count(true_str) != 0:
            print("have waf!")
            with open("./havewaf.txt", 'a') as fp:  # record the target in the matching file
                fp.write(edu_htmls + '\n')
        elif outputs.count(true_str) == 0 and errors == "":
            print("no waf")
            with open("./nowaf.txt", 'a') as fp:
                fp.write(edu_htmls + '\n')
        else:
            print("connect error!")
            with open("./connecterror.txt", 'a') as fp:
                fp.write(edu_htmls + '\n')
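A small add-on (mine, not in the original, reusing all_list from the block above): collapsing each collected link to scheme://host and deduplicating before scanning avoids running wafw00f several times against the same site.
# Sketch (addition): normalize and deduplicate targets before scanning
from urllib.parse import urlparse

unique_targets = []
seen = set()
for link in all_list:
    parsed = urlparse(link)
    root = parsed.scheme + '://' + parsed.netloc
    if parsed.netloc and root not in seen:
        seen.add(root)
        unique_targets.append(root)
print(len(unique_targets), 'unique targets to scan')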