Html文件读取与爬取(个人笔记篇)
html文件读取
1.通过open:打开文件地址,‘r’:读取文件,encoding:读取文件的格式
2.再通过html.read()
3.获取到html
def get_html(self, url):
    """Read a local HTML file at *url* and return its full contents as a string.

    The original opened the file and never closed it; 'with' guarantees the
    handle is released even if read() raises.
    """
    with open(url, 'r', encoding='utf-8') as html_file:
        return html_file.read()
解析html数据
1、利用bs4 进行解析数据
2、再通过循环,利用 item(字典)存储数据
再将保存在item中的数据存储到data_list中
def parse_data(self, html):
    """Parse the course table in *html* and append one dict per data row
    to self.data_list.

    The first <tr> is the table header and is skipped. Each remaining row's
    <td> cells are paired positionally with the CSV column names below
    (replacing the original's nine copy-pasted td1..td9 assignments).
    """
    fields = ('courseId', 'name', 'university', 'summary', 'teacher', 'term',
              'assessment_number', 'assessment_score', 'category')
    soup = BeautifulSoup(html, 'lxml')
    # [1:] skips the header row (the original tested index == 0 and continued).
    for tr in soup.find_all('tr')[1:]:
        td_list = tr.find_all('td')
        # zip pairs each column name with its cell text; a short row no longer
        # raises IndexError — missing columns are simply absent from the dict.
        item = dict(zip(fields, (td.string for td in td_list)))
        self.data_list.append(item)
存储爬取到的数据
利用with open以及csv模块创建和存储爬取到的数据
def __init__(self):
    """Set up the request headers, CSV column names, and the row accumulator."""
    # UA string mimics a desktop Chrome browser.
    ua = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
          "Chrome/97.0.4692.99 Safari/537.36 ")
    self.header = {"user-agent": ua}
    # Column order for the output CSV; parse_data fills dicts with these keys.
    self.head = [
        'courseId', 'name', 'university', 'summary', 'teacher',
        'term', 'assessment_number', 'assessment_score', 'category',
    ]
    # Parsed rows accumulate here before being written to disk.
    self.data_list = []
def save_data(self):
    """Write every collected row to train.csv, header line first."""
    with open('train.csv', 'w', encoding='utf-8', newline='') as out:
        writer = csv.DictWriter(out, self.head)
        writer.writeheader()
        writer.writerows(self.data_list)
完整代码展示
import csv
from bs4 import BeautifulSoup
class Html_pq():
    """Read a local HTML file of course rows, parse the table, and save it as CSV.

    Pipeline: get_html (read file) -> parse_data (extract rows) -> save_data
    (write train.csv). Indentation reconstructed from the pasted original.
    """

    def __init__(self):
        # Browser-like UA header; defined but not used by the file-based reads.
        self.header = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/97.0.4692.99 Safari/537.36 "
        }
        # CSV column names, in output order; also the dict keys used by parse_data.
        self.head = ['courseId', 'name', 'university', 'summary', 'teacher', 'term',
                     'assessment_number', 'assessment_score', 'category']
        # Parsed rows accumulate here before being written out.
        self.data_list = []

    def get_html(self, url):
        """Return the contents of the local HTML file at *url*.

        'with' closes the handle even if read() fails (the original leaked it).
        """
        with open(url, 'r', encoding='utf-8') as html_file:
            return html_file.read()

    def parse_data(self, html):
        """Parse each table row (skipping the header row) into a dict keyed by
        self.head and append it to self.data_list.

        Replaces the original's nine copy-pasted td1..td9 assignments with a
        single zip over the column names.
        """
        soup = BeautifulSoup(html, 'lxml')
        # [1:] skips the header <tr>.
        for tr in soup.find_all('tr')[1:]:
            cells = tr.find_all('td')
            self.data_list.append(
                dict(zip(self.head, (td.string for td in cells))))

    def save_data(self):
        """Write every collected row to train.csv, header line first."""
        with open('train.csv', 'w', encoding='utf-8', newline='') as file_obj:
            writer = csv.DictWriter(file_obj, self.head)
            writer.writeheader()
            writer.writerows(self.data_list)

    def main(self):
        """Run the full pipeline: read the file, parse it, save the CSV."""
        url = './2.Train.html'
        self.parse_data(self.get_html(url))
        self.save_data()
# Script entry point: build the scraper and run the whole pipeline.
if __name__ == '__main__':
    Html_pq().main()