# Extract the URLs of .swf resources from a HAR file exported by the browser
import pandas as pd
import json
import re
from collections import defaultdict
# Matches plain-http URLs that contain ".swf" immediately followed by a
# query string ("?...").
pattern = re.compile(r'^http://.*\.swf\?.*')


def extract_swf_urls(har_data):
    """Return the query-stripped URLs of every matching request in a HAR.

    Args:
        har_data: Parsed HAR document; the request URLs are read from
            ``har_data['log']['entries'][i]['request']['url']``.

    Returns:
        A list with one item per matching entry, in HAR order and with
        duplicates preserved. Each item is the part of the URL before its
        first '?'.

    Raises:
        KeyError: If the expected HAR keys are missing.
    """
    return [
        entry['request']['url'].split('?')[0]
        for entry in har_data['log']['entries']
        if pattern.match(entry['request']['url'])
    ]


def unique_sorted_frame(urls):
    """Build a one-column DataFrame of ``urls``, de-duplicated and sorted.

    The column is explicitly labelled ``0`` so that sorting also works when
    ``urls`` is empty; a DataFrame built from an empty list without explicit
    columns has no column ``0`` and ``sort_values(by=0)`` raises ``KeyError``.
    """
    frame = pd.DataFrame(urls, columns=[0])
    frame = frame.drop_duplicates(keep="first")
    return frame.sort_values(by=0)


if __name__ == "__main__":
    # Read the HAR file.
    with open(r"F:\临时文件夹\素材\mole.61.com.har", 'r', encoding='utf-8') as f:
        har_data = json.load(f)

    # Collect the matching URLs with everything after '?' removed.
    filtered_urls = extract_swf_urls(har_data)

    # Remove duplicate links and sort them.
    df_url = unique_sorted_frame(filtered_urls)

    # NOTE(review): to_csv also writes the column label "0" as the first line
    # of the output; kept as-is so the file format does not change. Pass
    # header=False here if a bare list of links is wanted.
    df_url.to_csv(r"F:\临时文件夹\素材\filtered_links3.txt", index=False)
# Note: the links are de-duplicated and sorted before being written out.