python解析html并根据表格内容进行移动

最新推荐文章于 2023-02-19 23:21:33 发布

叶小北

最新推荐文章于 2023-02-19 23:21:33 发布

阅读量379

点赞数

分类专栏： python脚本开发 文章标签： python

本文链接：https://blog.csdn.net/likai0/article/details/108111615

版权

python脚本开发专栏收录该内容

1 篇文章 0 订阅

订阅专栏

import os
import shutil
from urllib.parse import *

import openpyxl
import xlrd
import xlwt
def eq(l):
    """Print the rows of ``data.xlsx`` whose column-1 value is not in *l*.

    Opens the first sheet of ``data.xlsx`` (resolved against the current
    working directory) and, for every row whose second-column value is
    missing from *l*, prints that value and the row's first-column value
    separated by a tab.

    Args:
        l: Container of values already accounted for; only ``in`` is used.
    """
    # NOTE(review): xlrd >= 2.0 only reads .xls workbooks; opening this
    # .xlsx needs xlrd < 2.0 (or a port to openpyxl) - confirm the pin.
    sheet = xlrd.open_workbook('data.xlsx').sheet_by_index(0)
    for row in range(sheet.nrows):
        u = sheet.cell(row, 1).value
        # The original names column 0 "ip"; presumably an IP address.
        ip = sheet.cell(row, 0).value
        if u not in l:
            print(u, "\t", ip)
# File-move helper.
def moveFile(srcfile,dstfile):
    """Move *srcfile* to *dstfile*, creating the destination directory if needed.

    If *srcfile* is not an existing regular file, a message is printed and
    nothing is moved. Otherwise the directory part of *dstfile* is created
    (when there is one) and the file is moved with ``shutil.move``. A
    *dstfile* that ends in a path separator names a directory, so the file
    is moved into it under its own name.

    Args:
        srcfile: Path of the file to move.
        dstfile: Target file path, or target directory with a trailing
            separator.
    """
    if not os.path.isfile(srcfile):
        print ("%s 该文件不存在！请检查您的输入"%(srcfile))
        return

    dest_dir = os.path.dirname(dstfile)
    # A bare file name has no directory part, and os.makedirs('') raises,
    # so only create the directory when there is one to create.
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)
    shutil.move(srcfile, dstfile)

def _candidate_key(text):
    """Return the fallback lookup key for a heading that has no direct match.

    For http/https URLs and scheme-less text this is the network location
    when present, otherwise the path. For any other scheme the scheme
    itself is returned, which is what ``urlparse`` reports for text such as
    ``host:port``.
    """
    parts = urlparse(text)
    if parts.scheme in ('http', 'https', ''):
        return parts.netloc or parts.path
    return parts.scheme


def searchdata(l, dir, *,
               info_xls='C:\\Users\\yxb\\Downloads\\汇总高危\\网站基本信息20200424(1).xls',
               sites_xlsx='C:\\Users\\yxb\\Downloads\\汇总高危\\网站群网站清单_20200312入库(1).xlsx',
               dest_root="F:\\scrapy\\819\\"):
    """Move each HTML report in *dir* into a folder named after its unit.

    Two workbooks are read to build the lookup tables:

    * *info_xls*, first sheet: column 4 is the site key, column 5 the
      fallback key used when column 4 is ``'*'`` or ``'无'``, and the last
      column the unit. Column 5 is also mapped to column 16 as a second
      table (the original names these ``ip`` and ``un``).
    * *sites_xlsx*, first sheet: column 0 is the site key, column 3 the unit.

    Every entry of *l* found in the first table (or, failing that, in the
    column-5 table) is printed with its unit and remembered. Each regular
    file in *dir* is then parsed, the text of its third ``<h1>`` is taken,
    and the file is moved to ``dest_root + unit + "\\"`` when that text, or
    the key derived from it by ``_candidate_key``, was remembered.

    The original stored ``all_data[ip.index(i)]`` for column-5 matches.
    ``all_data`` gains an extra entry for every ``'*'``/``'无'`` row, so that
    index could select another row's unit. The unit printed for the match
    (column 16) is stored instead.

    Args:
        l: Lookup keys to resolve, e.g. the list returned by ``chooseInfo``.
            Entries must be hashable.
        dir: Directory holding the HTML reports.
        info_xls: Path of the site-information workbook.
        sites_xlsx: Path of the site-list workbook.
        dest_root: Prefix of the destination folders.
    """
    # NOTE(review): `pq` is not imported in the visible file; presumably
    # `from pyquery import PyQuery as pq` - confirm and add the import.
    # NOTE(review): xlrd >= 2.0 only reads .xls; opening the .xlsx
    # workbook needs xlrd < 2.0 (or a port to openpyxl) - confirm the pin.
    info_sheet = xlrd.open_workbook(info_xls).sheet_by_index(0)
    sites_sheet = xlrd.open_workbook(sites_xlsx).sheet_by_index(0)

    # setdefault keeps the first occurrence of a key, matching the
    # first-match semantics of the list.index lookups this replaces.
    unit_by_site = {}
    unit_by_ip = {}
    for row in range(info_sheet.nrows):
        site = info_sheet.cell(row, 4).value
        address = info_sheet.cell(row, 5).value
        unit_by_ip.setdefault(address, info_sheet.cell(row, 16).value)
        if site == '*' or site == '无':
            # No usable site key: fall back to the column-5 value.
            site = address
        unit_by_site.setdefault(site, info_sheet.cell(row, -1).value)
    for row in range(sites_sheet.nrows):
        unit_by_site.setdefault(sites_sheet.cell(row, 0).value,
                                sites_sheet.cell(row, 3).value)

    unit_for = {}
    for wanted in l:
        if wanted in unit_by_site:
            unit = unit_by_site[wanted]
        elif wanted in unit_by_ip:
            unit = unit_by_ip[wanted]
        else:
            continue
        print(wanted, "\t", unit)
        unit_for.setdefault(wanted, unit)

    for name in os.listdir(dir):
        path = os.path.join(dir, name)
        if not os.path.isfile(path):
            continue
        with open(path, encoding="utf-8") as f:
            doc = pq(f.read())  # parse the HTML text
        heading = doc("h1").eq(2).text()
        # Prefer an exact match on the heading; otherwise try the key
        # derived from it.
        key = heading if heading in unit_for else _candidate_key(heading)
        if key in unit_for:
            moveFile(path, dest_root + unit_for[key] + "\\")
    print('操作完成')









def chooseInfo(dir):
    """Collect one lookup key per HTML report found directly in *dir*.

    Each regular file in *dir* is read as UTF-8 and parsed, and the text of
    its third ``<h1>`` is reduced to a key: for http/https URLs and
    scheme-less text the network location when present, otherwise the path;
    for any other scheme the scheme itself, which is what ``urlparse``
    reports for text such as ``host:port``.

    The original appended network locations to the directory listing
    instead of the result, so every ``http(s)://host/...`` heading was
    dropped. They are now returned like the other keys.

    Args:
        dir: Directory holding the HTML reports; subdirectories are skipped.

    Returns:
        List of keys, one per file, in ``os.listdir`` order.
    """
    # NOTE(review): `pq` is not imported in the visible file; presumably
    # `from pyquery import PyQuery as pq` - confirm and add the import.
    listUrl = []
    for name in os.listdir(dir):
        path = os.path.join(dir, name)
        if not os.path.isfile(path):
            continue
        with open(path, encoding="utf-8") as f:
            doc = pq(f.read())  # parse the HTML text
        parts = urlparse(doc("h1").eq(2).text())
        if parts.scheme in ('http', 'https', ''):
            listUrl.append(parts.netloc or parts.path)
        else:
            listUrl.append(parts.scheme)
    return listUrl



# Script entry: collect the lookup keys from the report directory, then
# move each report into the folder of its unit.
report_dir = "C:\\Users\\yxb\\Downloads\\汇总高危\\总\\"
a = chooseInfo(report_dir)
searchdata(a, report_dir)
# Optional check: eq(a) prints the rows of data.xlsx whose column-1 value
# is not among the collected keys.
#eq(a)

叶小北

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python解析html并根据表格内容进行移动

import xlwt import openpyxl from urllib.parse import * import xlrd def eq(l): workbook = xlrd.open_workbook('data.xlsx') table = workbook.sheet_by_index(0) all_data=[] url=[] test=[] for i in range(0,table.nrows): u = table…
复制链接

扫一扫