# coding:utf-8
import re
import requests
import xlrd
from xlutils.copy import copy
class Traversal_Web:
    """Collect the links found on a web page and record each one's HTTP status.

    Results are written into the first sheet of an existing ``.xls`` workbook:
    column 0 receives the link, column 1 its status.
    """

    # Matches the value of every double- or single-quoted ``href`` attribute.
    _HREF_PATTERN = re.compile(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')")

    # Status recorded for links that are not absolute http(s) URLs; such
    # links are never requested.
    _SKIPPED_STATUS = 'xxx'

    def __init__(self):
        pass

    def url(self, path, sites, timeout=10):
        """Fetch ``sites``, check every link it contains, and store the results.

        Args:
            path: Path of an existing workbook readable by ``xlrd``; it is
                overwritten in place with the results added.
            sites: URL of the page whose links are traversed.
            timeout: Seconds to wait for each HTTP request before giving up.

        Raises:
            requests.RequestException: If the start page itself cannot be
                fetched. Failures of individual links do not raise; they are
                recorded in the status column instead.
        """
        page = requests.get(sites, timeout=timeout)
        links = self._extract_links(page.text)

        # Rows are numbered from 1, so row 0 of the sheet is never written
        # (presumably it holds a header -- confirm against the workbook).
        cells = []
        for row, link in enumerate(links, 1):
            final_url, status = self._probe(link, timeout)
            cells.append((row, 0, final_url))
            cells.append((row, 1, status))

        # Open and save the workbook once for the whole run rather than twice
        # per link; skip the file entirely when the page has no links.
        if cells:
            self._write_cells(path, cells)

    def write_xls(self, path, text, nrow, ncol):
        """Write ``text`` into cell (``nrow``, ``ncol``) of the first sheet and save.

        The workbook at ``path`` is read, copied into a writable workbook,
        modified, and saved back to the same path.
        """
        self._write_cells(path, [(nrow, ncol, text)])

    @classmethod
    def _extract_links(cls, html):
        """Return every quoted ``href`` value in ``html``, in document order."""
        return cls._HREF_PATTERN.findall(html)

    def _probe(self, link, timeout):
        """Return ``(url, status)`` for a single link.

        Absolute http(s) links are requested; the result holds the URL
        reported by the response and its status code. Any other link is
        returned unchanged with the skipped marker. A request that fails is
        reported as ``"error: <ExceptionName>"`` so that one bad link does
        not abort the whole traversal.
        """
        if not link.startswith(("http://", "https://")):
            return link, self._SKIPPED_STATUS
        try:
            response = requests.get(link, timeout=timeout)
        except requests.RequestException as exc:
            return link, "error: %s" % type(exc).__name__
        return response.url, response.status_code

    def _write_cells(self, path, cells):
        """Write ``(row, col, value)`` triples to the first sheet, saving once."""
        # xlrd workbooks are read-only; xlutils' copy() yields a writable one.
        workbook = copy(xlrd.open_workbook(path))
        sheet = workbook.get_sheet(0)
        for nrow, ncol, value in cells:
            sheet.write(nrow, ncol, value)
        workbook.save(path)
# Traversing URLs with Python
# (Latest recommended article published 2023-05-12 10:09:38)