Windows下Python3采集IP地址

一、安装Python环境

进入python官网https://www.python.org/getit/或https://www.python.org/downloads/下载python安装包,选择适合你操作系统的版本(建议选择最新的稳定版本)。

下载完安装包后双击安装,关键的地方看下图

第三步的“Add python.exe to PATH”记得勾选,方便之后在命令行窗口直接执行 python 相关命令。如果忘记勾选,可以事后手动添加:我的电脑 -> 属性 -> 高级系统设置 -> 环境变量 -> 选中 Path -> 编辑 -> 新建,填入 Python 的安装路径即可(例如:C:\python\)。

二、在windows命令行窗口查看是否成功

输入python命令,成功则会返回python版本等信息

三、安装python插件

1.更新pip

python -m pip install --upgrade pip

 

因为本地已经是最新的了,所以再次执行的提示是这样的

2.安装mysql-connector

mysql-connector 是 MySQL 官方提供的 Python 驱动(国内下载可能较慢;如果安装后发现版本过旧,可以改为安装官方维护的 mysql-connector-python 包)

python -m pip install mysql-connector

或者安装pymysql(注意:下文的 MySQLConnection 类使用的是 mysql.connector,如果改用 pymysql,需要相应修改其中的导入和连接代码)

python -m pip install pymysql

3.安装chrome selenium

python -m pip install selenium

4.安装redis

python -m pip install redis

四、操作数据库类MySQLConnection

创建文件MySQLConnection.py

import mysql.connector
from mysql.connector import Error

class MySQLConnection:
    """Small convenience wrapper around one ``mysql.connector`` connection.

    Driver errors are printed instead of propagated; every method reports
    failure through its return value (``None`` or an empty list).
    """

    def __init__(self, host='127.0.0.1', user='root', password='123456', database='test', port=3306, charset='utf8mb4'):
        """Store the connection settings; no connection is opened yet."""
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.database = database
        self.charset = charset
        self.connection = None

    def connect(self):
        """Open the connection, printing (not raising) any driver error."""
        try:
            self.connection = mysql.connector.connect(
                host=self.host,
                port=self.port,
                user=self.user,
                password=self.password,
                database=self.database,
                charset=self.charset,
                # Buffered cursors read the whole result set on execute(),
                # so commit() never trips over unread rows.
                buffered=True
            )
            if self.connection.is_connected():
                print("Connected to MySQL database")
        except Error as e:
            print(f"Error: {e}")

    def disconnect(self):
        """Close the connection if it is currently open."""
        if self.connection and self.connection.is_connected():
            self.connection.close()
            print("Disconnected from MySQL database")

    def _run(self, query, params=None):
        """Execute *query* and return the still-OPEN cursor, or ``None``.

        ``None`` is returned when there is no live connection or when the
        driver rejects the statement. The caller owns the returned cursor
        and must close it.
        """
        if not self.connection or not self.connection.is_connected():
            print("Not connected to the database")
            return None

        cursor = self.connection.cursor(dictionary=True)
        try:
            if params:
                cursor.execute(query, params)
            else:
                cursor.execute(query)
        except Error as e:
            print(f"Error executing query: {e}")
            cursor.close()
            return None
        return cursor

    def _fetch(self, query, params, method, default):
        """Run *query*, call ``cursor.<method>()`` and return its result.

        The rows are read BEFORE the cursor is closed; reading from a closed
        cursor is not guaranteed to work across driver implementations.
        *default* is returned when the query cannot be run or fetched.
        """
        cursor = self._run(query, params)
        if cursor is None:
            return default
        try:
            result = getattr(cursor, method)()
            # Commit as execute_query() does, so the read does not leave a
            # transaction open on the connection.
            self.connection.commit()
            return result
        except Error as e:
            print(f"Error executing query: {e}")
            return default
        finally:
            cursor.close()

    def execute_query(self, query, params=None):
        """Execute a statement and commit it.

        Returns the (already closed) cursor as a truthy success marker, or
        ``None`` on failure. Do not fetch rows from the returned cursor; use
        ``fetch_all`` / ``fetch_one`` for that.
        """
        cursor = self._run(query, params)
        if cursor is None:
            return None
        try:
            self.connection.commit()
            return cursor
        except Error as e:
            print(f"Error executing query: {e}")
            return None
        finally:
            cursor.close()

    def fetch_all(self, query, params=None):
        """Return every result row as a list of dicts (``[]`` on failure)."""
        return self._fetch(query, params, "fetchall", [])

    def fetch_one(self, query, params=None):
        """Return the first result row as a dict, or ``None``."""
        return self._fetch(query, params, "fetchone", None)

# 使用示例
#if __name__ == "__main__":
#    db = MySQLConnection(host="your_host", user="your_user", password="your_password", database="your_database")
#    db.connect()

    # 执行查询
#    query = "SELECT * FROM your_table WHERE column_name = %s"
#    params = ("some_value",)
#    rows = db.fetch_all(query, params)

#   for row in rows:
#        print(row)

#    db.disconnect()

五、采集脚本

创建文件caiji_ip.py

from selenium import webdriver
from MySQLConnection import MySQLConnection

import re
import time
import json


# Strip markup tags from a string with a regular expression.
def remove_tags(html_str):
    """Return *html_str* with every ``<...>`` span removed.

    The match is non-greedy and ``.`` does not cross newlines, so a tag that
    spans several lines is left untouched.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', html_str)

# Extract the location ("归属地") cell from a lookup page.
def getIpAddrByHtml(html):
    """Return the text of the first "归属地" table cell, or ``''`` if absent.

    Newlines are removed first so the pattern can match markup that is
    wrapped across lines; tags inside the captured cell are stripped.
    """
    flattened = ''.join(html.split('\n'))
    found = re.search(r'<td class="th">归属地</td><td><span>(.*?)</span>', flattened)
    if found is None:
        return ''
    return remove_tags(found.group(1))
        
# Extract the "ASN归属地" cell from a lookup page (second page layout).
def getIpAddrByHtml2(html):
    """Return the raw content of the first "ASN归属地" cell, or ``''``.

    Newlines are removed before matching. Unlike ``getIpAddrByHtml``, any
    tags inside the captured cell are returned as-is.
    """
    flattened = ''.join(html.split('\n'))
    found = re.search(
        r'<tr class="active"><td class="th">ASN归属地</td><td><span>(.*?)</span>',
        flattened,
    )
    return '' if found is None else found.group(1)

# Reduce a whole page to its text content (third source).
def getIpAddrByHtml3(html):
    """Return *html* with newlines removed and every tag stripped."""
    return remove_tags(html.replace('\n', ''))

def getIpList():
    """Load every row of the ``ip`` table.

    Returns whatever ``MySQLConnection.fetch_all`` yields for the query
    (presumably a list of dict rows; verify against that class). The
    connection is released even if the fetch raises.
    """
    # The statement was previously commented out, which made the fetch_all
    # call below fail with NameError.
    query = "SELECT a.* FROM ip a"

    db = MySQLConnection()
    db.connect()
    try:
        return db.fetch_all(query)
    finally:
        db.disconnect()

def decodeIpAddr(row,ipAddr):
    """Fill *row* from a whitespace-separated address and return it.

    Source page: https://www.ipshudi.com/127.0.0.1.htm
    Expected input looks like "中国 湖北省 武汉市 江夏区"; adjust the rules
    to your own needs.

    With at least two parts, ``province`` is overwritten and ``area`` is
    rebuilt from country + province. ``city`` (3rd part) and ``county``
    (4th part) are filled, and appended to ``area``, only when the row's
    current value is the empty string.
    """
    parts = ipAddr.split()
    total = len(parts)
    if total >= 2:
        country, province = parts[0], parts[1]
        row['province'] = province
        row['area'] = country + province
    # Finer-grained levels: (index in parts, row key).
    for index, key in ((2, 'city'), (3, 'county')):
        if total > index and row[key] == '':
            row[key] = parts[index]
            row['area'] = row['area'] + parts[index]
    return row

def decodeIpAddr2(row,ipAddr):
    """Fill *row* from an unseparated Chinese address and return it.

    Source page: https://www.ip138.com/iplookup.php?ip=127.0.0.1&action=2
    Expected input looks like "中国湖北省武汉市江夏区"; adjust the rules to
    your own needs.

    Three cases are handled:
    * contains "省": split into province / city / county on "省" and "市";
    * names a municipality (北京, 天津, 上海, 重庆): it is used as both
      province and city;
    * names an autonomous region (新疆, 内蒙古, 西藏, 宁夏, 广西): only the
      province is set.
    Anything else leaves *row* unchanged.
    """
    ipAddr = ipAddr.replace('中国', '')

    if '省' in ipAddr:
        pieces = ipAddr.split('省')
        province = pieces[0]
        row['province'] = province
        row['area'] = '中国' + province + '省'

        city_parts = pieces[1].split('市')
        if len(city_parts) == 2:
            city = city_parts[0] + '市'
            row['city'] = city
            row['area'] = row['area'] + city
            # Whatever follows "市" is taken as the county/district.
            if city_parts[1] != '':
                row['county'] = city_parts[1]
                row['area'] = row['area'] + city_parts[1]
        else:
            # No "市" (or more than one): keep the first piece verbatim.
            row['city'] = city_parts[0]
            row['area'] = row['area'] + city_parts[0]
        return row

    # Municipalities directly under the central government.
    for municipality in ('北京', '天津', '上海', '重庆'):
        if municipality in ipAddr:
            row['province'] = municipality
            row['city'] = municipality + "市"
            row['area'] = '中国' + ipAddr

    # Autonomous regions. 广西 was missing from the original list, so
    # Guangxi addresses never received a province.
    for region in ('新疆', '内蒙古', '西藏', '宁夏', '广西'):
        if region in ipAddr:
            row['province'] = region
            row['area'] = '中国' + ipAddr
    return row

def decodeIpAddr3(row,ipAddr):
    """Fill *row* from a JSON geolocation response and return it.

    Source API: https://qifu-api.baidubce.com/ip/geo/v1/district?ip=127.0.0.1
    *ipAddr* is the JSON text of the response; adjust the rules to your own
    needs. The row is only touched when the payload has a truthy ``data``
    object with a truthy ``prov``; then ``country``, ``province``, ``city``
    and ``isp`` are copied and ``area`` is their country+province+city
    concatenation.
    """
    payload = json.loads(ipAddr)
    geo = payload['data']
    if not geo or not geo['prov']:
        return row

    row['country'] = geo['country']
    row['province'] = geo['prov']
    row['city'] = geo['city']
    row['isp'] = geo['isp']
    row['area'] = row['country'] + row['province'] + row['city']
    return row

def _sql_literal(value):
    """Render *value* as a MySQL literal, for the human-readable SQL log only.

    The database call itself uses bound parameters; this exists so the log
    file stays readable and replayable even when a value contains quotes.
    """
    if value is None:
        return "NULL"
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return str(value)
    escaped = str(value).replace("\\", "\\\\").replace("'", "''")
    return "'" + escaped + "'"


def updateDbIpAddr(ipAddrArr):
    """Write the location fields of one row back to the ``ip`` table.

    *ipAddrArr* must contain ``id``, ``country``, ``province``, ``city``,
    ``county``, ``area`` and ``isp``. Nothing is written, and ``''`` is
    returned, when the province is empty or ``None``. Otherwise the statement
    is printed, appended to ``updateDbIpAddr.sql``, and executed; the result
    of ``MySQLConnection.execute_query`` is returned (falsy on failure).
    """
    if not ipAddrArr['province']:
        print("省份未知")
        return ''

    columns = ('country', 'province', 'city', 'county', 'area', 'isp')
    # The values come from scraped web pages, so they are bound as
    # parameters instead of being concatenated into the SQL text.
    sql = "UPDATE ip a SET " + ",".join(f"a.{column}=%s" for column in columns) + " WHERE a.id=%s"
    params = tuple(ipAddrArr[column] for column in columns) + (ipAddrArr['id'],)

    # Escaped, fully expanded form of the statement for the console and log.
    rendered = sql % tuple(_sql_literal(value) for value in params)
    print(rendered)
    # Append mode creates the file on first use.
    with open("updateDbIpAddr.sql", "a", encoding="utf-8") as file:
        file.write(rendered + ";\n")

    db = MySQLConnection()
    db.connect()
    try:
        rt = db.execute_query(sql, params)
    finally:
        db.disconnect()

    if rt:
        print("更新成功")
    else:
        print("更新失败")
    return rt
    
def main():
    """Look up every IP from the database and store its location.

    For each row a Chrome session is started, the lookup URL is loaded, the
    page is parsed and decoded, and the row is written back. A failure on one
    IP is reported and the loop moves on to the next one. Returns ``""`` when
    there are no rows to process.

    Requires ChromeDriver to be reachable on PATH, e.g. on Windows put
    chromedriver.exe in C:\\Windows\\System32. Download page named by the
    original author: https://sites.google.com/a/chromium.org/chromedriver/
    (NOTE(review): recent Selenium releases may locate the driver themselves;
    confirm for your version).
    """
    rows = getIpList()
    if not rows:
        print("无数据")
        return ""

    print("启动chrome")
    for row in rows:
        # Stays None if Chrome fails to start, so cleanup never touches an
        # unbound name.
        driver = None
        try:
            driver = webdriver.Chrome()
            # Alternative sources; both ban IPs that scrape too often. Each
            # needs its own parser/decoder pair (parser pairing presumed
            # from the page layouts; verify before switching):
            #   "https://www.ipshudi.com/"+row['ip']+".htm"
            #       -> getIpAddrByHtml + decodeIpAddr
            #   "https://www.ip138.com/iplookup.php?ip="+row['ip']+"&action=2"
            #       -> getIpAddrByHtml2 + decodeIpAddr2
            url = "https://qifu-api.baidubce.com/ip/geo/v1/district?ip="+row['ip']
            print("采集URL:"+url)
            driver.get(url)
            print("解析HTML")
            ipAddr = getIpAddrByHtml3(driver.page_source)
            print(row['ip']+" : "+ipAddr)
            if ipAddr != '':
                print("更新IP地址信息")
                row = decodeIpAddr3(row,ipAddr)
            updateDbIpAddr(row)
        except Exception as exc:
            # Best effort per IP: report the failure instead of hiding it,
            # then continue with the next row.
            print(f"采集失败 {row.get('ip')}: {exc!r}")
            continue
        finally:
            if driver is not None:
                driver.quit()
                print("关闭chrome")
        # Throttle successful lookups to reduce the risk of being blocked.
        time.sleep(2)


if __name__ == "__main__":
    main()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值