import os
import shutil
from urllib.parse import urlparse

import xlrd  # note: xlrd >= 2.0 dropped .xlsx support; this script reads .xlsx files
from pyquery import PyQuery as pq
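
# Pipeline sketch (paths and spreadsheet column indices are specific to the
# author's data set):
#   1. chooseInfo() pulls the scanned target (URL or IP) out of each saved
#      HTML report in a directory.
#   2. searchdata() maps those targets to their owning units via two Excel
#      inventories, then sorts the reports into per-unit folders with
#      moveFile().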
def eq(l):
    """Print every (URL, IP) row from data.xlsx whose URL is not in l."""
    workbook = xlrd.open_workbook('data.xlsx')
    table = workbook.sheet_by_index(0)
    for i in range(table.nrows):
        u = table.cell(i, 1).value
        ip = table.cell(i, 0).value
        if u not in l:
            print(u, "\t", ip)
# File move helper
def moveFile(srcfile, dstfile):
    """Move srcfile to dstfile, creating the destination directory if needed."""
    if not os.path.isfile(srcfile):
        print("%s does not exist! Please check your input." % srcfile)
    else:
        fpath, fname = os.path.split(dstfile)  # split into directory and filename
        if not os.path.exists(fpath):
            os.makedirs(fpath)  # create the destination directory
        shutil.move(srcfile, dstfile)  # move the file
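
# Usage note: searchdata() below calls moveFile() with a dstfile that ends in a
# separator, e.g. (hypothetical name) moveFile(path, "F:\\scrapy\\819\\unit_a\\").
# os.path.split() then yields an empty filename, and shutil.move() drops the
# source file into that folder under its original name.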
def searchdata(l, dir):
    """Map each target in l to its owning unit using two Excel inventories,
    then move the matching HTML reports in dir into per-unit folders."""
    workbook = xlrd.open_workbook('C:\\Users\\yxb\\Downloads\\汇总高危\\网站基本信息20200424(1).xls')
    workbook2 = xlrd.open_workbook('C:\\Users\\yxb\\Downloads\\汇总高危\\网站群网站清单_20200312入库(1).xlsx')
    table1 = workbook.sheet_by_index(0)
    table2 = workbook2.sheet_by_index(0)
    all_data = []  # owning unit per inventory row, parallel to url
    un = []        # owning unit per table1 row, parallel to ip
    ip = []
    url = []
    start = []     # matched targets
    end = []       # unit folder name for each matched target
    for i in range(table1.nrows):
        u = table1.cell(i, 4).value
        unit = table1.cell(i, -1).value  # owning unit (last column)
        ip.append(table1.cell(i, 5).value)
        un.append(table1.cell(i, 16).value)
        if u == '*' or u == '无':  # '无' means "none": fall back to the IP column
            u = table1.cell(i, 5).value
        url.append(u)
        all_data.append(unit)
    for i in range(table2.nrows):
        u = table2.cell(i, 0).value
        unit = table2.cell(i, 3).value
        url.append(u)
        all_data.append(unit)
    for i in l:
        if i in url:
            print(i, "\t", all_data[url.index(i)])
            start.append(i)
            end.append(all_data[url.index(i)])
        elif i in ip:
            print(i, "\t", un[ip.index(i)])
            start.append(i)
            end.append(un[ip.index(i)])  # un is parallel to ip; all_data is not
    for name in os.listdir(dir):
        path = os.path.join(dir, name)
        if not os.path.isfile(path):
            continue
        with open(path, encoding="utf-8") as f:
            content = f.read()
        doc = pq(content)  # parse the HTML report
        item = doc("h1")
        s = item.eq(2).text()  # third <h1> holds the scanned target
        res = urlparse(s)
        if s in start:
            moveFile(path, "F:\\scrapy\\819\\" + end[start.index(s)] + "\\")
        elif res.scheme in ('http', 'https', ''):
            if res.netloc == '':
                # no scheme: urlparse leaves the bare host in .path
                if res.path in start:
                    moveFile(path, "F:\\scrapy\\819\\" + end[start.index(res.path)] + "\\")
            else:
                if res.netloc in start:
                    moveFile(path, "F:\\scrapy\\819\\" + end[start.index(res.netloc)] + "\\")
        else:
            # "host:port" can parse with the host as the scheme
            if res.scheme in start:
                moveFile(path, "F:\\scrapy\\819\\" + end[start.index(res.scheme)] + "\\")
    print('Done')
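
# Why searchdata() juggles scheme/netloc/path: urlparse() only fills netloc
# when the string carries a scheme, and a bare "host:port" string can come
# back with the host in .scheme on some Python 3 versions. Illustrative
# values (example.com is a placeholder):
#   urlparse('http://example.com/a')  # scheme='http', netloc='example.com'
#   urlparse('example.com/a')         # scheme='',     path='example.com/a'
#   urlparse('example.com:8080/a')    # scheme='example.com', hence the
#                                     # res.scheme fallback branch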
def chooseInfo(dir):
    """Read every HTML report in dir, take the target string from its third
    <h1>, and return the list of extracted hosts/paths."""
    l = []
    listUrl = []
    for name in os.listdir(dir):
        path = os.path.join(dir, name)
        if not os.path.isfile(path):
            continue
        with open(path, encoding="utf-8") as f:
            content = f.read()
        doc = pq(content)  # parse the HTML report
        item = doc("h1")
        l.append(item.eq(2).text())  # third <h1> holds the scanned target
    for url in l:
        res = urlparse(url)
        if res.scheme in ('http', 'https', ''):
            if res.netloc == '':
                listUrl.append(res.path)  # no scheme: bare host ends up in .path
            else:
                listUrl.append(res.netloc)  # was list.append(), which lost the value
        else:
            listUrl.append(res.scheme)  # "host:port" parses the host into .scheme
    return listUrl
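
# chooseInfo() assumes each report stores its target in the third <h1>; a
# (hypothetical) report body like
#   <h1>High</h1><h1>3</h1><h1>example.com</h1>
# would contribute 'example.com' to the returned list.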
if __name__ == '__main__':
    a = chooseInfo("C:\\Users\\yxb\\Downloads\\汇总高危\\总\\")
    searchdata(a, "C:\\Users\\yxb\\Downloads\\汇总高危\\总\\")
    # eq(a)