Python web crawler

sed -i -e 's/\r$//' install_python.sh    convert Windows (CRLF) line endings to Linux (LF)

sudo !!    rerun the previous command as root

Aliyun CentOS 7 RPM mirror
http://mirrors.aliyun.com/centos/7/os/x86_64/Packages/
Cache the RPM files to resolve dependencies, then install the RPMs offline
https://www.cnblogs.com/nmap/p/9511848.html

sudo rpm -Uvh ./gcc/*.rpm --nodeps --force    force installation, ignoring dependency checks

sudo !!    rerun the previous command as root
./configure --prefix=/usr    set the install prefix
make    build according to the Makefile (drives gcc)
make install    install; make uninstall removes it if the Makefile provides that target
CC is conventionally a link (or variable) pointing to gcc; gcc is a compiler driver that supports several languages (C, C++, Objective-C, Ada, Fortran, and Java)

First version

#!/bin/bash
# Requires Python 3.8.2; Windows and Linux scripts use different line endings
echo "Defining parameters"
python_path="/opt/python/"
sh_path="/opt/nssa/"

sudo rpm -Uvh ./python/rpm/*.rpm --nodeps --force

if [ -d $python_path ]; then
    sudo rm -rf $python_path
fi
sudo tar -xvf ./python/Python-v3.8.2.tgz
sudo mv Python-3.8.2 /opt
sudo mv /opt/Python-3.8.2 /opt/python

sudo cp -r ./python/packages ${python_path}
sudo cp ./python/requirements.txt ${python_path}

if [ -d /opt/python3 ]; then
    sudo rm -rf /opt/python3
fi

# Grant write permissions on the intermediate build directories
sudo mkdir /opt/python3
sudo chmod 777 -R /opt/python3/*
sudo chmod 777 -R /opt/python/*

cd $python_path
sudo ./configure --prefix=/opt/python3
sudo chmod 777 -R /opt/python3/*
sudo chmod 777 -R /opt/python/*
sudo make 
sudo chmod 777 -R /opt/python3/*
sudo chmod 777 -R /opt/python/*
sudo make install

# Symlinks
sudo rm -rf /usr/bin/python3
sudo rm -rf /usr/bin/pip3 
sudo ln -s /opt/python3/bin/python3.8 /usr/bin/python3
sudo ln -s /opt/python3/bin/pip3.8 /usr/bin/pip3
# Install third-party packages offline
sudo pip3 install --no-index --find-links=$python_path/packages -r $python_path/requirements.txt
cd $sh_path
sudo python3 CNVD.py >/dev/null 2>&1 &

Second version

#!/bin/bash
# Requires Python 3.8.2; Windows and Linux scripts use different line endings
echo "Defining parameters"
python_path="/opt/python/"
sh_path="/opt/nssa/"

if [ -d $python_path ]; then
    sudo rm -rf $python_path
fi

sudo tar -xvf ./python/Python-v3.8.2.tgz
sudo mv Python-3.8.2 /opt
sudo mv /opt/Python-3.8.2 /opt/python

sudo cp -r ./python/packages ${python_path}
sudo cp ./python/requirements.txt ${python_path}

cd $python_path
sudo ./configure --prefix=/usr/local/python3
sudo make && sudo make install
# Symlinks
sudo rm -rf /usr/bin/python3
sudo rm -rf /usr/bin/pip3 
sudo ln -s /usr/local/python3/bin/python3.8 /usr/bin/python3
sudo ln -s /usr/local/python3/bin/pip3.8 /usr/bin/pip3
# Install third-party packages. pip3 list shows what is installed; pip3 show requests shows a package's version and install location
# Offline: pip3 freeze > requirements.txt ; offline install: pip3 install --no-index --find-links=DIR -r requirements.txt
# Downloading the packages also requires the mirror: pip3 download -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com -d DIR -r requirements.txt
#sudo pip3 install requests -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#sudo pip3 install beautifulsoup4 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#sudo pip3 install lxml  -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#sudo pip3 install pymysql  -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
sudo pip3 install --no-index --find-links=${python_path}packages -r ${python_path}requirements.txt
cd $sh_path
sudo python3 CNVD.py


import requests
from bs4 import BeautifulSoup
import time
import pymysql
# MySQL configuration
hosts='XXXX'
ports=XXXX
users='XXXX'
passwds='XXXX'
dbs='XXXX'

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
ur = ['29','32','28','27','30','31']
# CNVD - China National Vulnerability Database
urli = "https://www.cnvd.org.cn/flaw/typeResult?typeId="
urlii = "https://www.cnvd.org.cn"

def main():
    for u in ur:
        url = urli+u
        getDataByurl(url)
        
def getDataByurl(url):
    response = requests.get(url=url,headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    href = soup.find_all('a')
    for h in href:
        try:
            time.sleep(10)  # throttle requests to avoid anti-crawler checks
            urll = urlii+h.get('href')
            response1 = requests.get(url=urll,headers=headers)
            response1.encoding = 'utf-8'
            soup1 = BeautifulSoup(response1.text, 'lxml')
            title = soup1.find('h1').text
            urlll = urll
            all = soup1.find('table',class_="gg_detail").select('td')
            date = all[3].get_text().strip()
            level = all[5].get_text().strip()[0]
            product = all[7].get_text().strip()
            hole = all[9].get_text().strip()
            func = all[15].get_text().strip()
            patch =  all[17].get_text().strip()
            sql = "INSERT into reptile (url,title, date,`level`,product,hole,func,patch) values ('"+urlll+"','"+title+"','"+date+"','"+level+"','"+product+"','"+hole+"','"+func+"','"+patch+"');"
            query(sql)
        except Exception as e:
            print('error')
        
def query(sql):
    # Optional: pass charset='utf8mb4' to pymysql.connect
    conn = pymysql.connect(host=hosts, port=ports, user=users, passwd=passwds, db=dbs)
    cur = conn.cursor()
    reCount = cur.execute(sql)
    conn.commit()
    cur.close()
    conn.close()

if __name__ == '__main__':
    main()
    
    
#href = soup.find_all('a',class_="current")
# li = soup.find('li')
# print('find_li:',li)
# print('li.text (returns the tag text content):', li.text)
# print('li.attrs (returns the tag attributes):', li.attrs)
# print('li.string (returns the tag content as a string):', li.string)
# Tags are usually looked up with find_all(): <>.find_all(name, attrs, recursive, string, **kwargs) returns a list of matches; a short example follows the parameter list below.

# • name: string matched against tag names
# • attrs: string matched against tag attribute values; a specific attribute can be named
# • recursive: whether to search all descendants, default True
# • string: string matched against the text inside <>…</>
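
A minimal, self-contained illustration of these parameters; the HTML snippet below is invented purely for demonstration:

from bs4 import BeautifulSoup

html = '<ul><li class="current">item 1</li><li>item 2</li></ul>'
soup = BeautifulSoup(html, 'lxml')

# all <li> tags, then only those whose class attribute is "current"
print(soup.find_all('li'))
print(soup.find_all('li', attrs={'class': 'current'}))

# string= matches against the text content of a tag
print(soup.find_all(string='item 2'))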

# soup = BeautifulSoup(html, 'lxml') 
# print(type(soup.select('title'))) 
# print(soup.select('title')[0].get_text()) 
# for title in soup.select('title'):
#     print(title.get_text())
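
The INSERT statements in these scripts are assembled by string concatenation, which breaks as soon as a field contains a quote and is open to SQL injection. A minimal sketch of the same insert using pymysql parameter binding (reusing the connection settings and column names from the script above):

import pymysql

def insert_flaw(row):
    # row: (url, title, date, level, product, hole, func, patch)
    conn = pymysql.connect(host=hosts, port=ports, user=users,
                           passwd=passwds, db=dbs, charset='utf8mb4')
    sql = ("INSERT INTO reptile (url, title, date, `level`, product, hole, func, patch) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
    try:
        with conn.cursor() as cur:
            cur.execute(sql, row)
        conn.commit()
    finally:
        conn.close()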
 

 

Final version: complete loop

# _*_ coding:utf-8 _*_
# Install Python 3.5
#tar -zxvf Python-3.5.1.tar
#cd Python-3.5.1
# ./configure --prefix=/usr/local/python3
# make && make install
# Symlinks
# ln -s /usr/local/python3/bin/python3.5 /usr/bin/python3
# ln -s /usr/local/python3/bin/pip3.5 /usr/bin/pip3
# Dependencies
#yum -y install zlib-devel bzip2-devel openssl-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel

# Run with python3 and pip3

# Install packages, with the index switched to the Aliyun mirror (must use pip3)
#pip3 install selenium -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#pip3 install requests -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#pip3 install beautifulsoup4 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#pip3 install lxml  -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#pip3 install pymysql  -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com

# Error: HTTPSConnectionPool(host='www.cnvd.org.cn', port=443): Max retries exceeded with url: /flaw/show/CNVD-2020-45316
#pip3 install cryptography  -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#pip3 install pyOpenSSL  -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#pip3 install certifi  -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
import requests
from bs4 import BeautifulSoup
import time
import pymysql
# MySQL configuration
hosts='xx'
ports=3306
users='xxxx'
passwds='xxxx!'
dbs='xxxx'

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
'Connection':'close'}
ur = ['29','32','28','27','30','31']
# CNVD - China National Vulnerability Database
urli = "https://www.cnvd.org.cn/flaw/typeResult?typeId="
urlii = "https://www.cnvd.org.cn"

def main():
    for u in ur:
        url = urli+u
        getDataByurl(url)
        
def getDataByurl(url):
    response = requests.get(url=url,headers=headers,verify=False)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    href = soup.find_all('a')
#    print(href)
    for h in href:
        try:
            time.sleep(20)  # throttle requests to avoid anti-crawler checks
            urll = urlii+h.get('href')
            print(urll)
            
            find1 = "offset"
            page = find1 in urll
            if page:
                response1 = requests.get(url=urll,headers=headers,verify=False)
                response1.encoding = 'utf-8'
                soup1 = BeautifulSoup(response1.text, 'lxml')
#                print(response1.text)
                
                href1 = soup1.find_all('a')
#                print(href1)
                for hh in href1:
                    try:
                        time.sleep(20)  # throttle requests to avoid anti-crawler checks
                        url2 = urlii+hh.get('href')
                        print(url2)
                        # TODO: add real connection retries (see the Session/Retry sketch after this script)
                        requests.adapters.DEFAULT_RETRIES = 5
                        response2 = requests.get(url=url2,headers=headers,verify=False)
                        response2.encoding = 'utf-8'
                        soup2 = BeautifulSoup(response2.text, 'lxml')
#                        print(response2.text)
                        title = soup2.find('h1').text
                        urlll = url2
                        all = soup2.find('table',class_="gg_detail").select('td')
                        date = all[3].get_text().strip()
                        level = all[5].get_text().strip()[0]
                        product = all[7].get_text().strip()
                        hole = all[9].get_text().strip()
                        func = all[15].get_text().strip()
                        patch =  all[17].get_text().strip()
                        sql = "INSERT into reptile (url,title, date,`level`,product,hole,func,patch) values ('"+urlll+"','"+title+"','"+date+"','"+level+"','"+product+"','"+hole+"','"+func+"','"+patch+"');"
                        print(sql)
                        query(sql)
                    except Exception as e:
                        print('error 2')
                        print(e)
            else:
                # TODO: add real connection retries (see the Session/Retry sketch after this script)
                requests.adapters.DEFAULT_RETRIES = 5
                response1 = requests.get(url=urll,headers=headers,verify=False)
                response1.encoding = 'utf-8'
                soup1 = BeautifulSoup(response1.text, 'lxml')
#                print(response1.text)
                title = soup1.find('h1').text
                urlll = urll
                all = soup1.find('table',class_="gg_detail").select('td')
                date = all[3].get_text().strip()
                level = all[5].get_text().strip()[0]
                product = all[7].get_text().strip()
                hole = all[9].get_text().strip()
                func = all[15].get_text().strip()
                patch =  all[17].get_text().strip()
                sql = "INSERT into reptile (url,title, date,`level`,product,hole,func,patch) values ('"+urlll+"','"+title+"','"+date+"','"+level+"','"+product+"','"+hole+"','"+func+"','"+patch+"');"
                print(sql)
                query(sql)
        except Exception as e:
            print('error 1')
            print(e)
       
def query(sql):
    # Optional: pass charset='utf8mb4' to pymysql.connect
    try:
        print(33333)
        conn = pymysql.connect(host=hosts, port=ports, user=users, passwd=passwds, db=dbs)
        print(4444)
        cur = conn.cursor()
        print(5555)
        reCount = cur.execute(sql)
        print(6666)
        conn.commit()
        print(7777)
        cur.close()
        conn.close()
    except Exception as e:
        print('error 3')
        print(e)

if __name__ == '__main__':
    main()
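
Assigning requests.adapters.DEFAULT_RETRIES, as in the TODO lines above, most likely has no effect, because that default is bound when requests is imported. A sketch of the usual approach with a Session, urllib3's Retry, and the InsecureRequestWarning silenced (the script calls requests.get with verify=False); the parameter values are illustrative:

import requests
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# verify=False is used above, so silence the resulting warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

session = requests.Session()
retry = Retry(total=5, backoff_factor=2, status_forcelist=[429, 500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retry))
session.mount('http://', HTTPAdapter(max_retries=retry))

# drop-in replacement for the requests.get(...) calls in getDataByurl:
# response = session.get(url, headers=headers, verify=False, timeout=30)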
 
