Python web crawler

sed -i -e 's/\r$//' install_python.sh    convert Windows (CRLF) line endings to Linux (LF)

sudo !!    rerun the previous command as root

Aliyun CentOS 7 RPM mirror
http://mirrors.aliyun.com/centos/7/os/x86_64/Packages/
Cache the RPM files to resolve dependencies, then install the RPMs offline
https://www.cnblogs.com/nmap/p/9511848.html

sudo rpm -Uvh ./gcc/*.rpm --nodeps --force    force installation, ignoring dependency checks

sudo !!    rerun the previous command as root
./configure --prefix=/usr    set the install prefix
make    build according to the Makefile (drives gcc)
make install    install; make uninstall removes it if the Makefile provides that target
CC is conventionally a link (or variable) pointing to gcc; gcc is a compiler driver that supports several languages (C, C++, Objective-C, Ada, Fortran, and Java)

First version

#!/bin/bash
# Requires Python 3.8.2; Windows and Linux scripts use different line endings
echo "Defining parameters"
python_path="/opt/python/"
sh_path="/opt/nssa/"

sudo rpm -Uvh ./python/rpm/*.rpm --nodeps --force

if [ -d $python_path ]; then
    sudo rm -rf $python_path
fi
sudo tar -xvf ./python/Python-v3.8.2.tgz
sudo mv Python-3.8.2 /opt
sudo mv /opt/Python-3.8.2 /opt/python

sudo cp -r ./python/packages ${python_path}
sudo cp ./python/requirements.txt ${python_path}

if [ -d /opt/python3 ]; then
    sudo rm -rf /opt/python3
fi

# Grant write permissions on the intermediate build directories
sudo mkdir /opt/python3
sudo chmod 777 -R /opt/python3/*
sudo chmod 777 -R /opt/python/*

cd $python_path
sudo ./configure --prefix=/opt/python3
sudo chmod 777 -R /opt/python3/*
sudo chmod 777 -R /opt/python/*
sudo make 
sudo chmod 777 -R /opt/python3/*
sudo chmod 777 -R /opt/python/*
sudo make install

# Symlinks
sudo rm -rf /usr/bin/python3
sudo rm -rf /usr/bin/pip3 
sudo ln -s /opt/python3/bin/python3.8 /usr/bin/python3
sudo ln -s /opt/python3/bin/pip3.8 /usr/bin/pip3
# Install third-party packages offline
sudo pip3 install --no-index --find-links=$python_path/packages -r $python_path/requirements.txt
cd $sh_path
sudo python3 CNVD.py >/dev/null 2>&1 &

Second version

#!/bin/bash
# Requires Python 3.8.2; Windows and Linux scripts use different line endings
echo "Defining parameters"
python_path="/opt/python/"
sh_path="/opt/nssa/"

if [ -d $python_path ]; then
    sudo rm -rf $python_path
fi

sudo tar -xvf ./python/Python-v3.8.2.tgz
sudo mv Python-3.8.2 /opt
sudo mv /opt/Python-3.8.2 /opt/python

sudo cp -r ./python/packages ${python_path}
sudo cp ./python/requirements.txt ${python_path}

cd $python_path
sudo ./configure --prefix=/usr/local/python3
sudo make && sudo make install
# Symlinks
sudo rm -rf /usr/bin/python3
sudo rm -rf /usr/bin/pip3 
sudo ln -s /usr/local/python3/bin/python3.8 /usr/bin/python3
sudo ln -s /usr/local/python3/bin/pip3.8 /usr/bin/pip3
# Install third-party packages. pip3 list shows what is installed; pip3 show requests shows a package's version and install location
# Offline: pip3 freeze > requirements.txt ; offline install: pip3 install --no-index --find-links=DIR -r requirements.txt
# Downloading the packages also requires the mirror: pip3 download -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com -d DIR -r requirements.txt
#sudo pip3 install requests -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#sudo pip3 install beautifulsoup4 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#sudo pip3 install lxml  -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#sudo pip3 install pymysql  -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
sudo pip3 install --no-index --find-links=${python_path}packages -r ${python_path}requirements.txt
cd $sh_path
sudo python3 CNVD.py


import requests
from bs4 import BeautifulSoup
import time
import pymysql
# MySQL configuration
hosts='XXXX'
ports=XXXX
users='XXXX'
passwds='XXXX'
dbs='XXXX'

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
ur = ['29','32','28','27','30','31']
# CNVD - China National Vulnerability Database
urli = "https://www.cnvd.org.cn/flaw/typeResult?typeId="
urlii = "https://www.cnvd.org.cn"

def main():
    for u in ur:
        url = urli+u
        getDataByurl(url)
        
def getDataByurl(url):
    response = requests.get(url=url,headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    href = soup.find_all('a')
    for h in href:
        try:
            time.sleep(10)  # throttle requests to avoid anti-crawler checks
            urll = urlii+h.get('href')
            response1 = requests.get(url=urll,headers=headers)
            response1.encoding = 'utf-8'
            soup1 = BeautifulSoup(response1.text, 'lxml')
            title = soup1.find('h1').text
            urlll = urll
            all = soup1.find('table',class_="gg_detail").select('td')
            date = all[3].get_text().strip()
            level = all[5].get_text().strip()[0]
            product = all[7].get_text().strip()
            hole = all[9].get_text().strip()
            func = all[15].get_text().strip()
            patch =  all[17].get_text().strip()
            sql = "INSERT into reptile (url,title, date,`level`,product,hole,func,patch) values ('"+urlll+"','"+title+"','"+date+"','"+level+"','"+product+"','"+hole+"','"+func+"','"+patch+"');"
            query(sql)
        except Exception as e:
            print('error')
        
def query(sql):
    # Optional: pass charset='utf8mb4' to pymysql.connect
    conn = pymysql.connect(host=hosts, port=ports, user=users, passwd=passwds, db=dbs)
    cur = conn.cursor()
    reCount = cur.execute(sql)
    conn.commit()
    cur.close()
    conn.close()

if __name__ == '__main__':
    main()
    
    
#href = soup.find_all('a',class_="current")
# li = soup.find('li')
# print('find_li:',li)
# print('li.text (returns the tag text content):', li.text)
# print('li.attrs (returns the tag attributes):', li.attrs)
# print('li.string (returns the tag content as a string):', li.string)
# Tags are usually looked up with find_all(): <>.find_all(name, attrs, recursive, string, **kwargs) returns a list of matches; a short example follows the parameter list below.

# • name: string matched against tag names
# • attrs: string matched against tag attribute values; a specific attribute can be named
# • recursive: whether to search all descendants, default True
# • string: string matched against the text inside <>…</>
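
A minimal, self-contained illustration of these parameters; the HTML snippet below is invented purely for demonstration:

from bs4 import BeautifulSoup

html = '<ul><li class="current">item 1</li><li>item 2</li></ul>'
soup = BeautifulSoup(html, 'lxml')

# all <li> tags, then only those whose class attribute is "current"
print(soup.find_all('li'))
print(soup.find_all('li', attrs={'class': 'current'}))

# string= matches against the text content of a tag
print(soup.find_all(string='item 2'))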

# soup = BeautifulSoup(html, 'lxml') 
# print(type(soup.select('title'))) 
# print(soup.select('title')[0].get_text()) 
# for title in soup.select('title'):
#     print(title.get_text())
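
The INSERT statements in these scripts are assembled by string concatenation, which breaks as soon as a field contains a quote and is open to SQL injection. A minimal sketch of the same insert using pymysql parameter binding (reusing the connection settings and column names from the script above):

import pymysql

def insert_flaw(row):
    # row: (url, title, date, level, product, hole, func, patch)
    conn = pymysql.connect(host=hosts, port=ports, user=users,
                           passwd=passwds, db=dbs, charset='utf8mb4')
    sql = ("INSERT INTO reptile (url, title, date, `level`, product, hole, func, patch) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
    try:
        with conn.cursor() as cur:
            cur.execute(sql, row)
        conn.commit()
    finally:
        conn.close()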
 

 

Final version: complete loop

# _*_ coding:utf-8 _*_
# Install Python 3.5
#tar -zxvf Python-3.5.1.tar
#cd Python-3.5.1
# ./configure --prefix=/usr/local/python3
# make && make install
# Symlinks
# ln -s /usr/local/python3/bin/python3.5 /usr/bin/python3
# ln -s /usr/local/python3/bin/pip3.5 /usr/bin/pip3
# Dependencies
#yum -y install zlib-devel bzip2-devel openssl-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel

# Run with python3 and pip3

# Install packages, with the index switched to the Aliyun mirror (must use pip3)
#pip3 install selenium -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#pip3 install requests -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#pip3 install beautifulsoup4 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#pip3 install lxml  -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#pip3 install pymysql  -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com

# Error: HTTPSConnectionPool(host='www.cnvd.org.cn', port=443): Max retries exceeded with url: /flaw/show/CNVD-2020-45316
#pip3 install cryptography  -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#pip3 install pyOpenSSL  -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#pip3 install certifi  -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
import requests
from bs4 import BeautifulSoup
import time
import pymysql
# MySQL configuration
hosts='xx'
ports=3306
users='xxxx'
passwds='xxxx!'
dbs='xxxx'

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
'Connection':'close'}
ur = ['29','32','28','27','30','31']
# CNVD - China National Vulnerability Database
urli = "https://www.cnvd.org.cn/flaw/typeResult?typeId="
urlii = "https://www.cnvd.org.cn"

def main():
    for u in ur:
        url = urli+u
        getDataByurl(url)
        
def getDataByurl(url):
    response = requests.get(url=url,headers=headers,verify=False)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    href = soup.find_all('a')
#    print(href)
    for h in href:
        try:
            time.sleep(20)  # throttle requests to avoid anti-crawler checks
            urll = urlii+h.get('href')
            print(urll)
            
            find1 = "offset"
            page = find1 in urll
            if page:
                response1 = requests.get(url=urll,headers=headers,verify=False)
                response1.encoding = 'utf-8'
                soup1 = BeautifulSoup(response1.text, 'lxml')
#                print(response1.text)
                
                href1 = soup1.find_all('a')
#                print(href1)
                for hh in href1:
                    try:
                        time.sleep(20)  # throttle requests to avoid anti-crawler checks
                        url2 = urlii+hh.get('href')
                        print(url2)
                        # TODO: add real connection retries (see the Session/Retry sketch after this script)
                        requests.adapters.DEFAULT_RETRIES = 5
                        response2 = requests.get(url=url2,headers=headers,verify=False)
                        response2.encoding = 'utf-8'
                        soup2 = BeautifulSoup(response2.text, 'lxml')
#                        print(response2.text)
                        title = soup2.find('h1').text
                        urlll = url2
                        all = soup2.find('table',class_="gg_detail").select('td')
                        date = all[3].get_text().strip()
                        level = all[5].get_text().strip()[0]
                        product = all[7].get_text().strip()
                        hole = all[9].get_text().strip()
                        func = all[15].get_text().strip()
                        patch =  all[17].get_text().strip()
                        sql = "INSERT into reptile (url,title, date,`level`,product,hole,func,patch) values ('"+urlll+"','"+title+"','"+date+"','"+level+"','"+product+"','"+hole+"','"+func+"','"+patch+"');"
                        print(sql)
                        query(sql)
                    except Exception as e:
                        print('error 2')
                        print(e)
            else:
                # TODO: add real connection retries (see the Session/Retry sketch after this script)
                requests.adapters.DEFAULT_RETRIES = 5
                response1 = requests.get(url=urll,headers=headers,verify=False)
                response1.encoding = 'utf-8'
                soup1 = BeautifulSoup(response1.text, 'lxml')
#                print(response1.text)
                title = soup1.find('h1').text
                urlll = urll
                all = soup1.find('table',class_="gg_detail").select('td')
                date = all[3].get_text().strip()
                level = all[5].get_text().strip()[0]
                product = all[7].get_text().strip()
                hole = all[9].get_text().strip()
                func = all[15].get_text().strip()
                patch =  all[17].get_text().strip()
                sql = "INSERT into reptile (url,title, date,`level`,product,hole,func,patch) values ('"+urlll+"','"+title+"','"+date+"','"+level+"','"+product+"','"+hole+"','"+func+"','"+patch+"');"
                print(sql)
                query(sql)
        except Exception as e:
            print('error 1')
            print(e)
       
def query(sql):
    # Optional: pass charset='utf8mb4' to pymysql.connect
    try:
        print(33333)
        conn = pymysql.connect(host=hosts, port=ports, user=users, passwd=passwds, db=dbs)
        print(4444)
        cur = conn.cursor()
        print(5555)
        reCount = cur.execute(sql)
        print(6666)
        conn.commit()
        print(7777)
        cur.close()
        conn.close()
    except Exception as e:
        print('error 3')
        print(e)

if __name__ == '__main__':
    main()
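
Assigning requests.adapters.DEFAULT_RETRIES, as in the TODO lines above, most likely has no effect, because that default is bound when requests is imported. A sketch of the usual approach with a Session, urllib3's Retry, and the InsecureRequestWarning silenced (the script calls requests.get with verify=False); the parameter values are illustrative:

import requests
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# verify=False is used above, so silence the resulting warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

session = requests.Session()
retry = Retry(total=5, backoff_factor=2, status_forcelist=[429, 500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retry))
session.mount('http://', HTTPAdapter(max_retries=retry))

# drop-in replacement for the requests.get(...) calls in getDataByurl:
# response = session.get(url, headers=headers, verify=False, timeout=30)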
 
