My Python Web Scraping Learning Journey

Foreword

All of the Python crawler code in this article was written as preparation for penetration testing. Folks, please do not use it to scrape anything illegal. If anyone uses it for malicious, illegal crawling, that has nothing to do with me!!!

1. Scraping a Website's Homepage

# Scrape the Sogou homepage
import requests

if __name__ == "__main__":
    url = "https://www.sogou.com/" #指定url
    response = requests.get(url) # 发起请求并保存在response中
    pagedata = response.text # 持久化存储
    # print(pagedata)
    with open('./sougou.html','w',encoding='utf-8') as fp:
        fp.write(pagedata)
    print("爬取结束")

2. A Simple Web Page Collector

# A simple web page collector
import requests
# Spoof the UA: wrap a browser User-Agent in a dict
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
}
url = 'https://www.sogou.com/web'
# Wrap the URL's query parameter in a dict
kw = input("Enter a search keyword: ")
param = {
    'query': kw
}
response = requests.get(url=url,params=param,headers=headers)
pagedata = response.text
file_name = kw + '.html'
with open(file_name,'w',encoding = 'utf-8') as fp:
    fp.write(pagedata)
print("爬取并保存成功!")

3. Cracking Baidu Translate

# 'Crack' Baidu Translate
import requests
import json
if __name__ == "__main__":
    post_url = 'https://fanyi.baidu.com/sug'
    word = input("请输入word:")
    data_dic = {
        'kw':word
    }
    # UA spoofing
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
    }
    response = requests.post(url=post_url,data=data_dic,headers=headers) # send the POST request
    # get the response data (JSON format)
    dic_obj = response.json()
    file_name = word + ".json"
    with open(file_name,'w',encoding='utf-8') as fp:
        json.dump(dic_obj,fp=fp,ensure_ascii=False)
    print("Translation finished!")

4. Scraping Movie Details from Douban's Category Rankings

# Scrape the movie detail data from Douban's category rankings
import requests
import json

if __name__ == "__main__":
    url = 'https://movie.douban.com/j/chart/top_list'
    param = {
        'type': '24',
        'interval_id': '100:90',
        'action':'',
        'start': '40', # offset to start from
        'limit': '20' # number of movies per page
    }
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
    }
    response = requests.get(url=url,params=param,headers=headers)

    list_moviedata = response.json()
    with open('./douban.json','w',encoding='utf-8') as fp:
        json.dump(list_moviedata,fp=fp,ensure_ascii=False)
    print("over!!!")

5. Scraping KFC Restaurant Locations Nationwide
A small twist I added here: a for loop lets you control dynamically how many pages of data you scrape.

# Scrape KFC restaurant location information
import requests
import json

if __name__ == "__main__":

    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    city = input("请输入城市")
    kfc_datas = ''
    for page in range(1,10):
        param = {
              'cname':'' ,
              'pid': '',
              'keyword': city,
              'pageIndex': page,
              'pageSize': '10'
        }
        headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
            }
        response = requests.post(url=url,params=param,headers=headers)
        kfc_data = response.text
        kfc_datas += kfc_data
    file_name = city + '.txt'
    print(type(kfc_datas))
    with open(file_name,'w',encoding='utf-8') as fp:
        fp.write(kfc_datas)
    print("over!!!")

6. Scraping All Images of One Type from an Image Site with Regex

# Scrape all images of one type from this page of an image site

import requests
import re
import os

if __name__ == "__main__":
    if not os.path.exists('./imgdir'): # create a folder to hold all the images
        os.mkdir('./imgdir')

    url = 'https://www.tooopen.com/img/87_312.html'
    headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
            }
    web_data = requests.get(url=url,headers=headers).text
    ex = '<a class="pic".*?<img src="(.*?)" alt.*?</a>' #正则表达式匹配获取图片url
    img_src_list = re.findall(ex,web_data,re.S)
    for imghtml in img_src_list:  # loop over the scraped image URLs and fetch each one
        img_data = requests.get(imghtml).content
        img_name = imghtml.split('/')[-1] # image file name
        imgpath = './imgdir/' + img_name # image save path
        with open(imgpath,'wb') as fp:
            fp.write(img_data)
            print(img_name,'downloaded successfully!!!')


7. Scraping the Chapter Titles and Contents of Romance of the Three Kingdoms with bs4
I never finished this one, because the content on the original site got scrambled and whatever I scraped came back as something else entirely, but the approach is the same either way. Understanding the method is what matters. (A hedged sketch of how the loop could be finished is appended at the end of the code.)

# Scrape the chapter titles and chapter contents of the novel Romance of the Three Kingdoms
import requests
from bs4 import BeautifulSoup

if __name__ == "__main__":
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
            }
    pagedata = requests.get(url=url,headers=headers)
    pagedata.encoding = 'utf-8' # fix the encoding first, otherwise the scraped data comes back garbled
    pagehtml = pagedata.text
    soup = BeautifulSoup(pagehtml,'lxml') # instantiate a BeautifulSoup object
    li_list = soup.select('.book-mulu > ul > li')
    for li in li_list:
        title = li.a.string # chapter title
        title_url = 'https://www.shicimingju.com' + li.a['href'] # each chapter's URL
        detail_resp = requests.get(url=title_url,headers=headers) # fetch the chapter page
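        # --- Hedged sketch (not in the original): one way the loop body could be
        # finished. The "chapter_content" class is an assumption about the chapter
        # detail pages and may no longer match the current site.
        detail_resp.encoding = 'utf-8'
        detail_soup = BeautifulSoup(detail_resp.text, 'lxml')
        content_div = detail_soup.find('div', class_='chapter_content')
        if content_div is not None:
            with open('./sanguo.txt', 'a', encoding='utf-8') as fp:
                fp.write(title + '\n' + content_div.get_text() + '\n')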

8. Scraping Images of One Type from a Site with XPath
The key here is using XPath attribute matching to grab each image's download URL and file name. For garbled Chinese file names there are two fixes, shown in the snippets below: either set the encoding right after getting the response, or re-encode when building the file name.

# Fix 1: set the encoding right after the request
response = requests.get(url=url, headers=headers)
response.encoding = 'gbk'

# Fix 2: a general-purpose fix for garbled Chinese names
img_name = img_name.encode('iso-8859-1').decode('gbk')

# XPath: scrape the high-resolution images from an image site
import requests
from lxml import etree
import os

if __name__ == "__main__":
    url = 'https://pic.netbian.com/new/'
    headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
            }
    response = requests.get(url=url,headers=headers)
    response.encoding = 'gbk'
    page_data = response.text
    tree = etree.HTML(page_data)
    li_list = tree.xpath('//div[@class="slist"]//li')
    if not os.path.exists('./pics'): # create a folder to store the images
        os.mkdir('./pics')
    for li in li_list:
        img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0] # image download URL
        img_name = li.xpath('./a/img/@alt')[0] + '.jpg' # image file name
        # img_name = img_name.encode('iso-8859-1').decode('gbk') # fix garbled Chinese file names
        img_data = requests.get(url=img_src,headers=headers).content
        img_path = './pics/' + img_name
        with open(img_path,'wb') as fp:
            fp.write(img_data)
        print(img_name,'downloaded successfully!!!')

9. Scraping the Names of All Cities Nationwide with XPath
A quick note: the hot cities are already included in the full city list; pulling them out separately is just extra XPath practice.

# XPath: scrape the names of all cities

import requests
from lxml import etree

if __name__ == "__main__":
    url = 'https://www.aqistudy.cn/historydata/'
    headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
            }
    page_text = requests.get(url=url,headers=headers).text
    tree = etree.HTML(page_text)
    hot_city_list = tree.xpath('//div[@class="bottom"]/ul/li/a/text()') # names of the hot cities
    city_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text()') # names of all cities
    print(city_list)

    # You can also grab the hot cities and all the cities in a single XPath expression
    # city_list_name = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')
    # print(city_list_name)
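
To stay in line with the earlier sections, which all persist what they scrape, the names could also be written out to a file (the file name here is just an example):

    with open('./citys.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(city_list))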

10. Scraping the Free Resume Templates from 站长素材 (sc.chinaz.com)

# Scrape the free resume templates from 站长素材 (sc.chinaz.com)

import requests
import os
from lxml import etree

if __name__ == "__main__":
    url = 'https://sc.chinaz.com/jianli/free.html'
    headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
            }
    if not os.path.exists('./resume'):
        os.mkdir('./resume')
    response = requests.get(url=url,headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    resume_html = tree.xpath('//div[@id="container"]/div/a/@href') # page URL of each resume's download page
    for html in resume_html:
        rzhuanma = requests.get(url=html,headers=headers)
        rzhuanma.encoding = 'utf-8'
        resume_download = rzhuanma.text
        rtree = etree.HTML(resume_download)
        download = rtree.xpath('//div[@class="clearfix mt20 downlist"]/ul[@class="clearfix"]/li[1]/a/@href') # the download link
        download_url = "".join(download) # list to string so it can be requested directly
        file = requests.get(url=download_url,headers=headers).content # download the resume archive
        file_name = rtree.xpath('//div[@class="ppt_tit clearfix"]/h1/text()')
        file_namep = "".join(file_name) + '.rar'
        file_path = './resume/' + file_namep
        with open(file_path,'wb') as fp:
            fp.write(file)
        print(file_namep,"下载成功!!!")

11. CAPTCHA Recognition with the Chaojiying (超级鹰) Platform

# Chaojiying CAPTCHA recognition
import  requests
from lxml import etree
from hashlib import md5

class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password =  password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: CAPTCHA type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()


    def ReportError(self, im_id):
        """
        im_id: image ID of the CAPTCHA being reported as misrecognized
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()

if __name__ == "__main__":
    url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
    headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }
    page_text = requests.get(url=url,headers=headers).text
    tree = etree.HTML(page_text)
    check = tree.xpath('//img[@id="imgCode"]/@src')
    check_url = "https://so.gushiwen.cn" + "".join(check)
    check_img_path = './b.jpg'
    check_img_data = requests.get(url=check_url,headers=headers).content
    with open(check_img_path,'wb') as fp:
        fp.write(check_img_data)

    # Call the recognition platform's client to recognize the CAPTCHA
    chaojiying = Chaojiying_Client('idontcare', '1234567890', '950674')    # User Center >> Software ID: generate one and use it to replace 96001
    im = open('b.jpg', 'rb').read()                                        # local image file path to replace a.jpg; on Windows you sometimes need //
    print(chaojiying.PostPic(im, 1004)['pic_str'])                         # 1004 is the CAPTCHA type

12. Crawling Target URLs and Automatically Running WAF Detection
Here comes the big one!!! This is where the crawler meets scripting.

# Use a search engine to batch-collect edu.cn sites, check whether each one sits behind a WAF, and write the results to files so later penetration testing is easier
import requests
from lxml import etree
import subprocess
import os
import re

if __name__ == "__main__":
    def run_command(command):
        os.chdir("C:/Users/17398\PycharmProjects/untitled1\.idea\wafw00f-master\wafw00f") # 保证在main.py所在绝对路径打开命令行窗口
        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, error = process.communicate()
        return output, error

    url = 'https://www.google.com/search?q=inurl:edu.cn&gbv=2&sxsrf=AB5stBjMKJX8iPy1HwsLjpYFSgahd3kJpA:1689824667555&ei=m624ZNG-IYnWkPIP8PCJyAw&start=0&sa=N&ved=2ahUKEwiR-Kq4r5yAAxUJK0QIHXB4Ask4ChDy0wN6BAgDEAQ&biw=767&bih=736&dpr=1.25'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }
    page_text = requests.get(url=url,headers=headers).text
    tree = etree.HTML(page_text)
    edu_html = tree.xpath('//div[@class="yuRUbf"]/a/@href') # target URLs from the first results page
    page_sum = tree.xpath('//tr[@jsname="TeSSVd"]/td/a/@href') # URLs of results pages 2-10
    all_list = [] # empty list to collect every target URL
    all_list.extend(edu_html) # add the first page's targets to all_list
    for page_html in page_sum:
        page_htmls = "https://www.google.com" + "".join(page_html)
        else_data = requests.get(url=page_htmls,headers=headers).text
        else_tree = etree.HTML(else_data)
        else_html = else_tree.xpath('//div[@class="yuRUbf"]/a/@href') # target URLs from pages 2-10
        all_list.extend(else_html) # add them to all_list
    print(all_list)
    for edu_htmls in all_list:
        commands = "main.py " + edu_htmls #测试是否存在wafw00f
        output,error = run_command(commands)
        true_str = "seems to be behind a WAF" #根据返回内容判断
        outputs = output.decode('utf-8')
        errors = error.decode('utf-8')
        if outputs.count(true_str) != 0:
            print("have waf!")
            with open("./havewaf.txt",'a') as fp:  # 写入对应文件中
                fp.write(edu_htmls + '\n')
        elif outputs.count(true_str) == 0 and errors == "":
            print("no waf")
            with open("./nowaf.txt",'a') as fp:
                fp.write(edu_htmls + '\n')
        else:
            print("connect error!")
            with open("./connecterror.txt",'a') as fp:
                fp.write(edu_htmls + '\n')
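
A note on the subprocess call: running "main.py <url>" with shell=True relies on the Windows .py file association plus a hard-coded checkout path. If wafw00f is installed as a package (pip install wafw00f), it ships a wafw00f console command, so the same check can be done without the chdir; a minimal hedged sketch:

import subprocess

def check_waf(target_url):
    # assumes the wafw00f console script is on PATH (pip install wafw00f)
    proc = subprocess.run(['wafw00f', target_url], capture_output=True, text=True)
    return 'seems to be behind a WAF' in proc.stdout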

