Python BeautifulSoup select

最新推荐文章于 2023-03-30 23:35:12 发布

小小毛毛虫~

最新推荐文章于 2023-03-30 23:35:12 发布

阅读量234

点赞数

分类专栏：开发基本知识

本文链接：https://blog.csdn.net/Lyncai/article/details/115243595

版权

开发基本知识专栏收录该内容

181 篇文章 1 订阅

订阅专栏

import urllib.request
import requests
from bs4 import BeautifulSoup
#from urllib.parse import quote

"""
爬取静态网页 https://www.ccf.org.cn/Academic_Evaluation/AI/
比较 urllib.request 和 requests 的不同用法
以及中文字符乱码的处理
"""

def fetch_data_byrequest(url):
	"""Fetch *url* with ``urllib.request`` and return the body as text.

	The response is opened in a ``with`` block so the underlying
	connection is closed even if reading or decoding fails. The body is
	decoded with the charset declared in the response's Content-Type
	header, falling back to UTF-8 when no charset is declared.

	Args:
		url: Anything ``urllib.request.urlopen`` accepts (URL string or
			``Request`` object).

	Returns:
		The decoded response body as ``str``.

	Raises:
		urllib.error.URLError: If the URL cannot be opened.
		UnicodeDecodeError: If the body is not valid in the chosen charset.
	"""
	with urllib.request.urlopen(url) as resp:
		# Prefer the server-declared charset; UTF-8 was the previous
		# hard-coded behaviour and remains the default.
		charset = resp.headers.get_content_charset() or "utf-8"
		return resp.read().decode(charset)

def fetch_data_byrequests(url):
	"""Fetch *url* with ``requests`` and return the body as text.

	The text is decoded from the raw response bytes using the encoding
	that ``requests`` detects from the content (``apparent_encoding``),
	which avoids garbled Chinese characters when the server declares no
	charset or a wrong one.

	The earlier ``text.encode(encoding).decode(apparent_encoding)`` round
	trip raised ``TypeError`` when ``encoding`` was ``None`` and could
	corrupt characters that ``text`` had already replaced while decoding.

	Args:
		url: Address passed to ``requests.get``.

	Returns:
		The decoded response body as ``str``.

	Raises:
		requests.exceptions.RequestException: If the request fails.
	"""
	resp = requests.get(url)
	# Override the header-derived encoding with the content-detected one,
	# then let requests decode the raw bytes itself.
	resp.encoding = resp.apparent_encoding
	return resp.text

def parse_by_soap(content,selectors,split_char=""):
	"""Extract whitespace-normalised text for every CSS selector match.

	Args:
		content: HTML markup to parse with the ``html.parser`` backend.
		selectors: Iterable of CSS selector strings, processed in order.
		split_char: String used to re-join the whitespace-separated
			tokens of each matched element's text.

	Returns:
		A list with one string per matched element, grouped by selector
		in the order the selectors were given.
	"""
	document = BeautifulSoup(content, 'html.parser')
	return [
		# Splitting on whitespace and re-joining collapses newlines and
		# runs of blanks inside the element's text.
		split_char.join(node.get_text().split())
		for css in selectors
		for node in document.select(css)
	]

def output2file(file_name,content):
	"""Append *content* and a trailing newline to *file_name* as UTF-8.

	The file is opened in append mode and created if it does not exist.
	A ``with`` block guarantees the handle is closed even if writing
	fails.

	Args:
		file_name: Path of the file to append to.
		content: Object to write; converted to text by ``print``.
	"""
	with open(file_name, "a", encoding="utf-8") as f:
		print(content, file=f)



if __name__=="__main__":
	# Download the same page with both helpers; only the urllib.request
	# result is parsed below.
	url = "https://www.ccf.org.cn/Academic_Evaluation/AI/"
	content_req = fetch_data_byrequest(url)
	content_reqs = fetch_data_byrequests(url)

	# Every target list sits under the same container; the selectors
	# differ only in the <ul> position given to :nth-child().
	list_root = "body > div.main.m-b-md > div.container > div.row-box > div > div.col-md-10 > div > div > "
	ul_positions = (
		("magazine_a", 4),
		("magazine_b", 6),
		("magazine_c", 8),
		("conf_a", 12),
		("conf_b", 14),
		("conf_c", 16),
	)
	selectors = [
		"{}ul:nth-child({}) > li".format(list_root, position)
		for _label, position in ul_positions
	]
	split_char = ';'

	# Append one line per extracted <li> to the result file.
	for item in parse_by_soap(content_req, selectors, split_char):
		output2file("result.txt", item)

	# To inspect the raw downloads, dump them to files:
	#output2file("request_content.html",content_req)
	#output2file("requests_content.html",content_reqs)

抓到的文件：