一、安装Python环境
进入python官网https://www.python.org/getit/或https://www.python.org/downloads/下载python安装包,选择适合你操作系统的版本(建议选择最新的稳定版本)。
下载完安装包后双击安装,关键的地方看下图
第三步的 Add python.exe to PATH 记得勾选,这样之后可以在命令行窗口直接运行 python 相关的命令。如果忘记勾选,可以事后手动添加:我的电脑->属性->高级系统设置->环境变量->选中 Path->编辑->新建,填入 python 的安装路径(例如:C:\python\);如果还想直接使用 pip 命令,再把安装路径下的 Scripts 目录也加进去(例如:C:\python\Scripts\)
二、在windows命令行窗口查看是否成功
输入python命令,成功则会返回python版本等信息
三、安装python插件
1.更新pip
pip install --upgrade pip
因为本地已经是最新的了,所以再次执行的提示是这样的
2.安装mysql-connector
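mysql-connector-python 是 MySQL 官方提供的 Python 驱动(国内下载有点慢...)。注意:PyPI 上名为 mysql-connector 的包是早已停止更新的旧版本,应安装 mysql-connector-python,导入方式仍然是 import mysql.connector
python -m pip install mysql-connector-python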
或者安装pymysql
python -m pip install pymysql
3.安装chrome selenium
python -m pip install selenium
4.安装redis
python -m pip install redis
四、操作数据库类MySQLConnection
创建文件MySQLConnection.py
import mysql.connector
from mysql.connector import Error
class MySQLConnection:
    """Small convenience wrapper around one ``mysql.connector`` connection.

    Connection settings are stored by ``__init__``; the connection itself is
    only opened by :meth:`connect`. Errors raised by the driver are printed
    rather than propagated, so callers detect failure through return values
    (``None`` / ``[]``).
    """

    def __init__(self, host='127.0.0.1', user='root', password='123456', database='test', port=3306, charset='utf8mb4'):
        """Store the connection settings; no network access happens here."""
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.database = database
        self.charset = charset
        # Set by connect(); stays None until a connection has been opened.
        self.connection = None

    def connect(self):
        """Open the connection using the stored settings.

        On a driver error the message is printed and ``self.connection`` is
        left as it was.
        """
        try:
            self.connection = mysql.connector.connect(
                host=self.host,
                port=self.port,
                user=self.user,
                password=self.password,
                database=self.database,
                charset=self.charset,
                buffered=True
            )
            if self.connection.is_connected():
                print("Connected to MySQL database")
        except Error as e:
            print(f"Error: {e}")

    def disconnect(self):
        """Close the connection if one is currently open."""
        if self.connection and self.connection.is_connected():
            self.connection.close()
            print("Disconnected from MySQL database")

    def execute_query(self, query, params=None):
        """Execute ``query`` (optionally with ``params``) and commit.

        Returns:
            The still-open dictionary cursor on success, so that the caller
            can read rows from it; the caller is responsible for closing it.
            ``None`` when there is no open connection or the driver raised an
            error (the error is printed and the cursor is closed).
        """
        if not self.connection or not self.connection.is_connected():
            print("Not connected to the database")
            return None
        cursor = self.connection.cursor(dictionary=True)
        try:
            if params:
                cursor.execute(query, params)
            else:
                cursor.execute(query)
            self.connection.commit()
        except Error as e:
            print(f"Error executing query: {e}")
            # Nothing useful can be read from a failed cursor; release it.
            cursor.close()
            return None
        # The cursor must NOT be closed here: fetch_all()/fetch_one() still
        # have to read the result set from it.
        return cursor

    def fetch_all(self, query, params=None):
        """Run ``query`` and return all rows as a list of dicts (``[]`` on failure)."""
        cursor = self.execute_query(query, params)
        if not cursor:
            return []
        try:
            return cursor.fetchall()
        finally:
            cursor.close()

    def fetch_one(self, query, params=None):
        """Run ``query`` and return the first row as a dict (``None`` on failure)."""
        cursor = self.execute_query(query, params)
        if not cursor:
            return None
        try:
            return cursor.fetchone()
        finally:
            cursor.close()
# 使用示例
#if __name__ == "__main__":
# db = MySQLConnection(host="your_host", user="your_user", password="your_password", database="your_database")
# db.connect()
# 执行查询
# query = "SELECT * FROM your_table WHERE column_name = %s"
# params = ("some_value",)
# rows = db.fetch_all(query, params)
# for row in rows:
# print(row)
# db.disconnect()
五、采集脚本
创建文件caiji_ip.py
from selenium import webdriver
from MySQLConnection import MySQLConnection
import re
import time
import json
#正则替换字符串中的标签
def remove_tags(html_str):
    """Return ``html_str`` with every ``<...>`` span removed.

    The match is non-greedy and ``.`` does not cross line breaks, so a tag
    that spans several lines is left untouched.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', html_str)
#正则匹配地址
def getIpAddrByHtml(html):
    """Extract the text of the first "归属地" table cell from ``html``.

    Line breaks are removed first so the pattern can match markup that is
    spread over several lines. Any tags inside the captured cell are stripped.
    Returns ``''`` when the cell is not found.
    """
    flattened = html.replace('\n', '')
    found = re.search(r'<td class="th">归属地</td><td><span>(.*?)</span>', flattened)
    if found is None:
        return ''
    return remove_tags(found.group(1))
#正则匹配地址2
def getIpAddrByHtml2(html):
    """Extract the text of the first "ASN归属地" table cell from ``html``.

    Line breaks are removed first so the pattern can match markup that is
    spread over several lines. The captured text is returned as-is (tags
    inside it are not stripped). Returns ``''`` when the cell is not found.
    """
    flattened = html.replace('\n', '')
    found = re.search(
        r'<tr class="active"><td class="th">ASN归属地</td><td><span>(.*?)</span>',
        flattened,
    )
    return found.group(1) if found is not None else ''
#正则匹配地址3
def getIpAddrByHtml3(html):
    """Return ``html`` with line breaks removed and all tags stripped.

    Used for pages whose whole visible content is the payload (main() feeds
    the result to decodeIpAddr3, which parses it as JSON).
    """
    single_line = ''.join(html.split('\n'))
    return remove_tags(single_line)
def getIpList():
    """Load every row of the ``ip`` table.

    Returns:
        A list of row dicts as produced by ``MySQLConnection.fetch_all``
        (an empty list when the query could not be run).
    """
    # The statement was previously only present as a commented-out line, which
    # left ``query`` undefined and made this function raise NameError.
    query = "SELECT a.* FROM ip a"
    db = MySQLConnection()
    db.connect()
    try:
        rows = db.fetch_all(query)
    finally:
        # Always release the connection, even if the fetch raises.
        db.disconnect()
    return rows
def decodeIpAddr(row, ipAddr):
    """Fill location fields of ``row`` from a whitespace-separated address.

    Intended for text scraped from https://www.ipshudi.com/<ip>.htm, e.g.
    ``"中国 湖北省 武汉市 江夏区"`` (country, province, city, county).
    The rules are meant to be adjusted to actual needs.

    ``province`` and ``area`` are overwritten whenever at least two parts are
    present; ``city`` and ``county`` are only filled in when their current
    value is the empty string. ``row`` is modified in place and returned.
    """
    parts = ipAddr.split()
    part_count = len(parts)

    if part_count > 1:
        country, province = parts[0], parts[1]
        row['province'] = province
        row['area'] = country + province

    if part_count > 2 and row['city'] == '':
        city = parts[2]
        row['city'] = city
        row['area'] += city

    if part_count > 3 and row['county'] == '':
        county = parts[3]
        row['county'] = county
        row['area'] += county

    return row
def decodeIpAddr2(row, ipAddr):
    """Fill location fields of ``row`` from an unseparated Chinese address.

    Intended for text scraped from
    https://www.ip138.com/iplookup.php?ip=<ip>&action=2, e.g.
    ``"中国湖北省武汉市江夏区"``. The rules are meant to be adjusted to
    actual needs.

    * Addresses containing "省": ``province`` is the text before the first
      "省" (stored without the suffix), ``city`` is the text up to and
      including the first "市" that follows, and ``county`` is whatever
      remains. When no "市" follows, the whole remainder is stored as
      ``city``.
    * Otherwise the address is checked against the municipalities and the
      autonomous regions; only ``province`` (plus ``city`` for
      municipalities) and ``area`` are set.

    ``row`` is modified in place and returned.
    """
    ipAddr = ipAddr.replace('中国', '')
    if '省' in ipAddr:
        province, _, remainder = ipAddr.partition('省')
        row['province'] = province
        row['area'] = '中国' + province + '省'
        if '市' in remainder:
            # Split on the FIRST "市" only, so a county-level city such as
            # "苏州市昆山市" yields city "苏州市" and county "昆山市" instead
            # of being cut into three pieces.
            city, _, county = remainder.partition('市')
            row['city'] = city + '市'
            row['area'] = row['area'] + city + '市'
            if county != '':
                row['county'] = county
                row['area'] = row['area'] + county
        else:
            # No "市" at all (e.g. an autonomous prefecture): keep the text as-is.
            row['city'] = remainder
            row['area'] = row['area'] + remainder
    else:
        # Municipalities directly under the central government.
        for municipality in ('北京', '天津', '上海', '重庆'):
            if municipality in ipAddr:
                row['province'] = municipality
                row['city'] = municipality + "市"
                row['area'] = '中国' + ipAddr
        # Autonomous regions (all five, including 广西).
        for region in ('新疆', '内蒙古', '西藏', '宁夏', '广西'):
            if region in ipAddr:
                row['province'] = region
                row['area'] = '中国' + ipAddr
    return row
def decodeIpAddr3(row, ipAddr):
    """Fill location fields of ``row`` from a JSON geo-lookup response.

    Intended for the body returned by
    https://qifu-api.baidubce.com/ip/geo/v1/district?ip=<ip>. The rules are
    meant to be adjusted to actual needs.

    The fields ``country``, ``province`` (from ``prov``), ``city``, ``isp``
    and ``area`` (country + province + city) are only written when the
    response has a ``data`` object with a non-empty ``prov``; otherwise
    ``row`` is returned unchanged. Missing or null fields are stored as ``''``.

    Args:
        row: dict to update in place.
        ipAddr: JSON text of the response.

    Returns:
        The same ``row`` object.

    Raises:
        json.JSONDecodeError: if ``ipAddr`` is not valid JSON.
    """
    payload = json.loads(ipAddr)
    data = payload.get('data') if isinstance(payload, dict) else None
    # Error responses may carry no "data" object at all; treat that the same
    # as "province unknown" instead of failing with KeyError.
    if not isinstance(data, dict) or not data.get('prov'):
        return row
    # ``or ''`` normalises both absent keys and JSON null, so the string
    # concatenation below cannot raise TypeError.
    row['country'] = data.get('country') or ''
    row['province'] = data.get('prov') or ''
    row['city'] = data.get('city') or ''
    row['isp'] = data.get('isp') or ''
    row['area'] = row['country'] + row['province'] + row['city']
    return row
def _sql_literal(value):
    """Render ``value`` as a SQL literal for the human-readable audit log only."""
    if value is None:
        return "NULL"
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return str(value)
    escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return "'" + escaped + "'"


def updateDbIpAddr(ipAddrArr):
    """Write the decoded location fields of one row back to the ``ip`` table.

    The statement is also printed and appended to ``updateDbIpAddr.sql`` as
    an audit trail.

    Args:
        ipAddrArr: row dict with the keys ``id``, ``country``, ``province``,
            ``city``, ``county``, ``area`` and ``isp``.

    Returns:
        ``''`` when ``province`` is empty (nothing is written); otherwise the
        result of ``MySQLConnection.execute_query`` (a cursor on success,
        ``None`` on failure).
    """
    if ipAddrArr['province'] == '':
        print("省份未知")
        return ''
    # The values come from scraped web pages, so they are passed as bound
    # parameters instead of being concatenated into the statement.
    sql = ("UPDATE ip a SET a.country=%s,a.province=%s,a.city=%s,"
           "a.county=%s,a.area=%s,a.isp=%s WHERE a.id=%s")
    params = (
        ipAddrArr['country'],
        ipAddrArr['province'],
        ipAddrArr['city'],
        ipAddrArr['county'],
        ipAddrArr['area'],
        ipAddrArr['isp'],
        ipAddrArr['id'],
    )
    # Escaped, fully rendered form used only for the console and the log file.
    rendered = sql % tuple(_sql_literal(value) for value in params)
    print(rendered)
    # Append mode creates the file on first use; ``with`` closes it.
    with open("updateDbIpAddr.sql", "a", encoding="utf-8") as file:
        file.write(rendered + ";\n")
    db = MySQLConnection()
    db.connect()
    try:
        rt = db.execute_query(sql, params)
    finally:
        # Always release the connection, even if the update raises.
        db.disconnect()
    if rt:
        print("更新成功")
    else:
        print("更新失败")
    return rt
def main():
    """Look up every IP from the database in a browser and store the result.

    For each row returned by getIpList() a Chrome session is started, the
    geo-lookup URL for ``row['ip']`` is loaded, the page text is decoded with
    decodeIpAddr3() and the row is written back with updateDbIpAddr(). A
    failure on one IP is reported and the loop moves on to the next one.

    Returns:
        ``""`` when there are no rows to process, otherwise ``None``.
    """
    rows = getIpList()
    # Chrome needs a matching ChromeDriver. Selenium 4.6+ can locate or
    # download one automatically; with older versions chromedriver must be on
    # the system PATH (on Windows, e.g. chromedriver.exe in
    # C:\Windows\System32).
    if not rows:
        print("无数据")
        return ""
    print("启动chrome")
    for row in rows:
        # Bound before the try so the cleanup below never hits an unbound name
        # when starting Chrome itself fails.
        driver = None
        try:
            driver = webdriver.Chrome()
            # Alternative sources (both block the client IP when scraped too
            # frequently); they pair with getIpAddrByHtml/decodeIpAddr and
            # getIpAddrByHtml2/decodeIpAddr2 respectively:
            #   https://www.ipshudi.com/<ip>.htm
            #   https://www.ip138.com/iplookup.php?ip=<ip>&action=2
            url = "https://qifu-api.baidubce.com/ip/geo/v1/district?ip="+row['ip']
            print("采集URL:"+url)
            driver.get(url)
            print("解析HTML")
            ipAddr = getIpAddrByHtml3(driver.page_source)
            print(row['ip']+" : "+ipAddr)
            if ipAddr != '':
                print("更新IP地址信息")
                row = decodeIpAddr3(row, ipAddr)
                updateDbIpAddr(row)
        except Exception as exc:
            # Keep the batch going, but say what went wrong instead of
            # silently dropping the error.
            print(f"采集失败: {exc}")
            continue
        finally:
            if driver is not None:
                driver.quit()
                print("关闭chrome")
        # Pause between successful lookups to avoid hammering the service.
        time.sleep(2)
# Script entry point. The call is unguarded, so the collection job runs
# whenever this file is loaded (executed directly or imported).
main()