import requests
import urllib.request
from bs4 import BeautifulSoup
import json
def bs_parse_names(html):
    """Extract paper titles from the conference program page HTML.

    Parameters
    ----------
    html : str
        Raw HTML of the program page.

    Returns
    -------
    list[str]
        Stripped text of the first ``<b>`` tag inside each
        ``<div class="list-group-item">`` entry.
    """
    # Kept as a global for backward compatibility: the module-level loop
    # at the bottom of this script reads tname_list after get_tname() runs.
    global tname_list
    soup = BeautifulSoup(html, "lxml")
    # Each paper entry is a <div class="list-group-item">; its title is the
    # text of the first <b> inside. Guard against entries without a <b> so
    # one malformed row does not abort the whole scrape.
    tname_list = [
        div.b.text.strip()
        for div in soup.find_all('div', class_='list-group-item')
        if div.b is not None
    ]
    return tname_list
def get_tname():
    """Download the IEEE S&P 2019 program page and return its paper titles.

    Returns
    -------
    list[str]
        Paper titles parsed by bs_parse_names().
    """
    url = 'http://www.ieee-security.org/TC/SP2019/program-papers.html'
    # Context manager closes the HTTP response; the original leaked the
    # connection by never calling close().
    with urllib.request.urlopen(url) as response:
        return bs_parse_names(response.read().decode('utf-8'))
def write_to_file(content):
    """Append *content* as one JSON-encoded line to ``19ssp.txt``.

    Parameters
    ----------
    content : object
        Any JSON-serializable value (here, a paper-title string).
    """
    # Removed leftover debug print(type(json.dumps(content))), which both
    # serialized the content twice and polluted stdout on every call.
    with open('19ssp.txt', 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII (e.g. CJK) text readable.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
if __name__ == '__main__':
    # Script entry: scrape the titles once, echo them, and persist each
    # title as its own JSON line.
    tname = get_tname()
    print(tname)
    # Iterate the returned list rather than the mutated global tname_list —
    # same contents, but the dependency on the global side effect is gone.
    for title in tname:
        write_to_file(title)
# Page-footer residue from the blog this script was copied from (not code):
#   "A record | 爬取论文名" (A record | scraping paper titles)
#   "最新推荐文章于 2024-03-06 15:16:25 发布" (latest recommended post published 2024-03-06 15:16:25)