# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib.request
import re
import xlwt
from urllib.parse import quote
import json
def main():
    """Entry point: crawl Baidu Baike for every school listed in school.txt.

    Builds the item base URL and delegates the crawl/parse work to
    ``getdata``.

    Returns:
        The data list produced by ``getdata`` (previously computed and
        silently discarded; returning it keeps old callers working — they
        ignored the implicit ``None`` — while making the result usable).
    """
    baseurl = 'https://baike.baidu.com/item/'
    # Crawl and parse each school's Baike page.
    # NOTE(review): leftover debug print("0") removed.
    datalist = getdata(baseurl)
    return datalist
def school_list(filename):
    """Read school names from *filename* (UTF-8), one name per line.

    Args:
        filename: Path to a text file listing one school name per line.

    Returns:
        list[str]: Lines with only the trailing '\\n' stripped — other
        whitespace (including '\\r') is preserved, matching the original
        behavior; blank lines become empty strings.
    """
    # 'with' guarantees the handle is closed even if reading raises,
    # unlike the previous manual open()/close() pair.
    with open(filename, 'r', encoding='utf-8') as fr:
        return [line.strip('\n') for line in fr]
def getdata(baseurl):
print("1")
school = school_list('school.txt')
datalist=[]
for index in school:
url = baseurl + urllib.parse.quote(index)
html=askurl(url)
#逐一解析
soup = BeautifulSoup(html,"html.parser")
name_data = []
value_data = []
name_node = soup.find_all('dt', class_='basicInfo-item name')
for i in range(len(name_node)):
name_data.append(name_node[i].get_text().replace('\xa0', ''))
# name_data.append(name_node[i].get_text())
#
value_node = soup.find_all('dd', class_='basicInfo-item value')
for i in range(len(value_node)):
value_data.append(value_node[i].get_text().replace('\n', ''))
# print(type(value_node[i].get_text().replace('\n', '')))
# print(value_node[i].get_text().replace('\n', ''))
# print(value_data)
# print(type(value_data))
result = {'中文名': '无信息', '英文名':
# --- blog-scrape residue (not code): article tag and publish timestamp ---
# 高校、专利信息  (article tags: "universities, patent information")
# 于 2022-03-08 21:13:56 首次发布  (first published 2022-03-08 21:13:56)