Python BeautifulSoup select

 

import urllib.request
import requests
from bs4 import BeautifulSoup
#from urllib.parse import quote

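"""
Scrape the static page https://www.ccf.org.cn/Academic_Evaluation/AI/,
comparing how urllib.request and requests are used,
and how garbled Chinese characters (encoding problems) are handled.
"""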

def fetch_data_byrequest(url):
	"""Download *url* with ``urllib.request`` and return the body as text.

	Args:
		url: Address to open; anything ``urllib.request.urlopen`` accepts.

	Returns:
		The response body decoded to ``str``. The charset declared in the
		response's Content-Type header is used when present, otherwise UTF-8.

	Raises:
		urllib.error.URLError: If the URL cannot be opened.
		UnicodeDecodeError: If the body is not valid in the chosen charset.
	"""
	# The context manager closes the connection even if read() raises.
	with urllib.request.urlopen(url) as response:
		raw = response.read()
		charset = response.headers.get_content_charset() or "utf-8"
	try:
		return raw.decode(charset)
	except LookupError:
		# The server declared a charset Python has no codec for.
		return raw.decode("utf-8")

def fetch_data_byrequests(url, timeout=10):
	"""Download *url* with ``requests`` and return the body as text.

	The body is decoded from the raw bytes using the encoding that requests
	detects from the content itself (``apparent_encoding``), because the
	header-derived ``encoding`` can be wrong for Chinese pages and produce
	garbled characters.

	Args:
		url: Address to fetch.
		timeout: Seconds to wait for the server before giving up. Without a
			timeout ``requests.get`` can block indefinitely.

	Returns:
		The response body as ``str``; undecodable bytes are replaced rather
		than raising.

	Raises:
		requests.exceptions.RequestException: If the request fails or times out.
	"""
	req = requests.get(url, timeout=timeout)
	# Decode the bytes once. Re-encoding req.text with req.encoding fails when
	# req.encoding is None and can raise on characters lost in the first decode.
	encoding = req.apparent_encoding or req.encoding or "utf-8"
	try:
		return req.content.decode(encoding, errors="replace")
	except LookupError:
		# Detected/declared encoding name is unknown to Python's codecs.
		return req.content.decode("utf-8", errors="replace")

def parse_by_soap(content,selectors,split_char=""):
	"""Extract whitespace-normalized text for every element matching the selectors.

	Args:
		content: HTML markup to parse with BeautifulSoup's ``html.parser``.
		selectors: Iterable of CSS selectors, applied in the given order.
		split_char: String used to re-join the whitespace-separated pieces of
			each element's text.

	Returns:
		A list with one string per matched element, ordered by selector and
		then by the order ``select`` returns the matches.
	"""
	document = BeautifulSoup(content, 'html.parser')
	return [
		# split() with no argument drops all runs of whitespace, including newlines.
		split_char.join(node.get_text().split())
		for selector in selectors
		for node in document.select(selector)
	]

def output2file(file_name,content):
	"""Append *content* plus a trailing newline to *file_name* as UTF-8.

	Args:
		file_name: Path of the file to append to; created if it does not exist.
		content: Object to write; it is rendered the way ``print`` renders it.
	"""
	# The with-block guarantees the handle is closed even if print() raises.
	with open(file_name, "a", encoding="utf-8") as out:
		print(content, file=out)



if __name__=="__main__":
	url = "https://www.ccf.org.cn/Academic_Evaluation/AI/"

	# Download the same page through both HTTP clients so the results can be compared.
	content_req = fetch_data_byrequest(url)
	content_reqs = fetch_data_byrequests(url)

	# All target lists are <ul> children of one container; only the child position differs.
	list_container = "body > div.main.m-b-md > div.container > div.row-box > div > div.col-md-10 > div > div"
	# Positions of the lists previously named magazine A/B/C (4, 6, 8)
	# and conference A/B/C (12, 14, 16).
	list_positions = (4, 6, 8, 12, 14, 16)
	selectors = [
		"{} > ul:nth-child({}) > li".format(list_container, position)
		for position in list_positions
	]
	split_char = ';'

	# Parse the urllib-fetched copy and append one line per list entry.
	for item in parse_by_soap(content_req, selectors, split_char):
		output2file("result.txt", item)

	# Uncomment to dump the raw pages from both clients for comparison.
	#output2file("request_content.html",content_req)
	#output2file("requests_content.html",content_reqs)

抓到的文件:

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值