使用python脚本抓取ZW论文数据demo
业务场景:因为项目的数据比较少,所以通过第三方渠道抓取数据(数据收集 )
使用
一、准备工作
- 搭建环境
语言:python
python版本:3.9
工具:PyCharm
- 导入python包
使用到的库:
- Requests:用于发送 HTTP 请求√
- BeautifulSoup:用于解析 HTML√
- PyMySQL:用于连接和操作 MySQL 数据库√
- JsonPath:处理json数据
- Scrapy:一个功能强大的爬虫框架 ×
安装
使用 pip 安装所需库:
注意:安装慢需要设置镜像
pip install requests beautifulsoup4 lxml pymysql
二、测试案例
注意:修改自己的mysql密码和账号
import time
import requests
import re
import pymysql
from bs4 import BeautifulSoup
from operatorKeyWords import operate_search_keywords
from db_config import read_db_config
# 1.模拟浏览器获取数据
# 2.将获取的数据做处理
# 3.最后直接写到数据库中
# 处理字符串格式
def remove_tags(text):
    """Return *text* with HTML tags stripped.

    The inner text of a matched open/close tag pair is kept, but every
    space inside that inner text is dropped; any remaining unpaired
    tags are deleted outright. Non-string input is coerced via str().
    """
    if not isinstance(text, str):
        text = str(text)
    # Paired tags: keep the content, minus its spaces.
    stripped = re.sub(
        r'<[^>]*>(.*?)</[^>]*>',
        lambda m: m.group(1).replace(' ', ''),
        text,
    )
    # Leftover standalone tags carry no content — remove them entirely.
    return re.sub(r'<[^>]*>', '', stripped)
# 处理字符串格式
def remove_symbols_from_string(input_list):
    """Stringify *input_list* and delete brackets, single quotes and spaces.

    e.g. ['a', 'b'] -> "a,b"
    """
    # One translate() pass replaces the four chained .replace() calls.
    return str(input_list).translate(str.maketrans('', '', "[]' "))
# 去除多余的空格
def remove_spaces(input_string):
    """Return *input_string* with every ASCII space removed.

    Other whitespace (tabs, newlines) is left untouched.
    """
    return ''.join(input_string.split(' '))
def get_content(page, keyword, timeout=30):
    """Fetch one page of CNKI journal search results for *keyword*.

    Args:
        page: 1-based page number to request.
        keyword: subject search keyword, interpolated into QueryJson.
        timeout: seconds to wait for the HTTP response. New, backward-
            compatible parameter — previously the request had no timeout
            and could hang forever.

    Returns:
        A list of dicts with any of the keys 'title', 'authors',
        'source', 'date' (a dict may be partial when some table cells
        are missing from a row); an empty list on HTTP or network
        failure, so the caller's paging loop can continue.
    """
    url = 'https://kns.cnki.net/kns8s/brief/grid'
    # NOTE(review): Cookie/session values below are captured from one
    # browser session and will expire — refresh them when requests fail.
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        # 'Content-Length' deliberately omitted: the original hard-coded
        # '721', but the body length varies with *keyword*; requests
        # computes the correct value itself.
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'UM_distinctid=190586112fb223-07b485938bd306-26021e51-1fa400-190586112fc6f0; Ecp_ClientId=l240627142800353297; Ecp_ClientIp=123.14.254.235; knscanary=never; Ecp_IpLoginFail=24070261.52.73.77; SID_sug=018132; docmrcanary=always; SID_kns_new=kns15128001; KNS2COOKIE=1719887682.994.36867.634651|b25e41a932fd162af3b8c5cff4059fc3; tfstk=fh5mLYMmHtJbBU0Tm_AbCNAXR_e-hmO62Gh9XCKaU3-WMdKA7hXG0ivwBrtOqljl5clvglGiIGQH6fpTlhlNVw-VkNEssVRfwth9DoOG7MdgvkFL9Z_XGCrLvlRfqAAeSmkZXd8y4g5db6FL9Z_V94XG0WU6fc26XhRw3hkra3Tt_mSw3a8yVFTZ0Fl4ra-W7CoZ_CPyUe8K0fl1Af-h__1zmiS7DjIQd_TDnhc94fXhaXp2YCKr_l6koc-Fu3cZ_nT2Nc7lkocCyI1cxeIYajjhR_skzMPgqCs5QG8G4Wi9G9IfOLjYLb7yN3Jez6ztd3CkDG5DUyhHBQ5pLIW8Fop9iejNGi0i_M7RJs95skmyftLBaeQbibAFuFSrYYkFgkG6zPCrCA92PUxQf5EA29fU-5aurvwX3UTYvz4oCA92PUxLrzDIcK8WkkC..; dblang=both',
        'Host': 'kns.cnki.net',
        'Origin': 'https://kns.cnki.net',
        'Pragma': 'no-cache',
        'Referer': 'https://kns.cnki.net/kns8s/defaultresult/index?classid=YSTT4HG0&korder=SU&kw=%E5%86%9C%E5%9E%A6',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '"Google Chrome";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    data = {
        'boolSearch': 'false',
        'QueryJson': '{"Platform":"","Resource":"JOURNAL","Classid":"YSTT4HG0","Products":"","QNode":{"QGroup":[{"Key":"Subject","Title":"","Logic":0,"Items":[{"Field":"SU","Value":"'+keyword+'","Operator":"TOPRANK","Logic":0}],"ChildItems":[]}]},"ExScope":1,"SearchType":7,"Rlang":"CHINESE","KuaKuCode":""}',
        'pageNum': page,
        'pageSize': '20',
        'sortField': 'PT',
        'sortType': 'desc',
        'dstyle': 'listmode',
        'boolSortSearch': 'false',
        'aside': '',
        'searchFrom': '资源范围:学术期刊',
    }
    articles = []
    try:
        response = requests.post(url, headers=headers, data=data, timeout=timeout)
    except requests.RequestException as exc:
        # Network failure on one page should not abort the whole run.
        print(f"请求异常: {exc}")
        return articles
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'lxml')
        # Each result is one <tr> in the listing table.
        rows = soup.find_all('tr')
        for row in rows:
            article = {}
            # Title: anchor inside the "name" cell.
            title_element = row.find('td', class_='name')
            if title_element:
                title_link = title_element.find('a')
                if title_link:
                    article['title'] = title_link.get_text(strip=True)
            # Authors: every KnowledgeNetLink anchor in the row.
            author_elements = row.find_all('a', class_='KnowledgeNetLink')
            if author_elements:
                article['authors'] = ', '.join(
                    author.get_text(strip=True) for author in author_elements
                )
            # Journal name.
            source_element = row.find('td', class_='source')
            if source_element:
                source_link = source_element.find('a')
                if source_link:
                    article['source'] = source_link.get_text(strip=True)
            # Publication date.
            date_element = row.find('td', class_='date')
            if date_element:
                article['date'] = date_element.get_text(strip=True)
            # Keep any row that yielded at least one field (header rows
            # produce an empty dict and are skipped).
            if article:
                articles.append(article)
        print(articles)
    else:
        print(f"请求失败,状态码: {response.status_code}")
    return articles
def inputDB(dataList):
    """Persist scraped article dicts into srp_result_temp_thesis.

    Args:
        dataList: list of dicts from get_content, with optional keys
            'title', 'authors', 'source', 'date'.

    A row is inserted only when no existing row has the same name
    (title), so re-runs do not duplicate data. Each insert is committed
    individually (as before), so a mid-run failure keeps earlier rows.
    The connection is now always closed — the original leaked it on any
    exception.
    """
    # DB credentials come from the external config file, not hard-coded.
    db_config = read_db_config()
    conn = pymysql.connect(
        host=db_config['host'],
        port=int(db_config['port']),
        user=db_config['user'],
        password=db_config['password'],
        database=db_config['database'],
        charset=db_config['charset']
    )
    # Hoisted out of the loop: the SQL never changes per row.
    select_query = "SELECT * FROM srp_result_temp_thesis WHERE name = %s"
    insert_query = """
    INSERT INTO srp_result_temp_thesis (name, first_author, periodical_name, publish_date,create_time, source,temp_status)
    VALUES (%s, %s, %s, %s,%s, %s, %s);
    """
    try:
        # pymysql cursors are context managers; closes the cursor for us.
        with conn.cursor() as cursor:
            # `article` replaces the original loop variable `object`,
            # which shadowed the builtin.
            for article in dataList:
                cursor.execute(select_query, (article.get('title', 'N/A'),))
                if cursor.fetchall():
                    # Already stored — skip.
                    continue
                values = (
                    article.get('title', ''),
                    # Authors: strip spaces, join with 、 instead of ,
                    remove_spaces(article.get('authors', '')).replace(",", "、"),
                    article.get('source', ''),
                    article.get('date', ''),
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())),
                    1,          # source flag: 1 = CNKI
                    '待处理',   # temp_status: pending review
                )
                cursor.execute(insert_query, values)
                conn.commit()
    finally:
        conn.close()
# 程序的入口 run
if __name__ == '__main__':
    # Look up the configured search keyword (source id 1, type 1).
    keyword = operate_search_keywords(1, 1)
    print('搜索来源:知网---论文数据----搜索关键字为:' + keyword)
    pages = 10
    # CNKI pages are 1-based; iterate 1..pages directly.
    for page in range(1, pages + 1):
        print('第' + str(page) + '次请求数据')
        # 1. fetch one page of results from CNKI
        data = get_content(page, keyword)
        # 2./3. clean and write straight into the database
        inputDB(data)
        # Back off a little longer on every page to stay polite.
        time.sleep(10 + (page - 1) * 2)
注意点:
- 关键字搜索:wd=农垦
- 类型的搜索:filter=sc_type%3D%7B5%7D
- headers:请求头
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
# 'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'PSTM=1720075027; BAIDUID=CB639625F1F69C58EDE0D264F98E652F:FG=1; BAIDUID_BFESS=CB639625F1F69C58EDE0D264F98E652F:FG=1; BIDUPSID=AF8236A49BC0C5D6D81309D9EA8BBBE3; ZFY=RiwOIYYk3OuqPFhFb7b66hcDduGAcmFIv808hRliBq8:C; Hm_lvt_f28578486a5410f35e6fbd0da5361e5f=1719829940,1719883502,1719976333,1720143163; H_PS_PSSID=60236_60366; BA_HECTOR=a5a5808kag812l2g24a084ah1ju05v1j8i5jp1u; delPer=0; BD_CK_SAM=1; PSINO=1; antispam_key_id=45; antispam_site=ae_xueshu_paper; BDRCVFR[w2jhEs_Zudc]=mbxnW11j9Dfmh7GuZR8mvqV; ariaDefaultTheme=undefined; ab_sr=1.0.1_NWY0N2MxMDM1OTM3ZTAxMjgxMDEwNmFmZDI0MGYwNjI3YWEyNjYxZDMwMTEyMWE2NWYzOTgwZjhlNjhmYTc2ZGFhYmNhNzI3YmJhMzA3NzJiNDM5YWM1NWUyOGUzNDFjMWJjZDM2YWQ1ZmUwZDYwZTQzNjJhMjMwMDE5MmMyOWEwMzUyODNmZjJiYWViYzBjOWNiMjFkYjAyMWU1NjUyNzQxNGEwNjJiYzE2M2JjMWVlODRhNzc3YzcyNzAzZDE4YTM3Y2U3ZGFhYTY0MTEwMjYyZTFjZWNlOWY5NDI2OTc=; antispam_data=bf07958fd14f9cbd1417daf18917863e20e0b5924cffc16bebea9e107bace006f6632c31eef7d962db39ef909d4a535ccaedb50e11726f9c42fc2750296641ada6216ec7a17fcb98adcbf40c846c6fcb9baadb34b53ab616b60ca32ad2f6ac906dc1dcce1441a5433db48b53413ef3ce; antispam_sign=0d6437a5; BDSVRTM=404',
'Host': 'xueshu.baidu.com',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3880.400 QQBrowser/10.8.4554.400 ",
'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Microsoft Edge";v="122"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
注意点:
- Accept-Encoding: 服务器会根据这个头信息对响应进行适当的编码 (需要注释掉,不然会乱码)
- User-Agent:这个是模拟浏览器的重要元素
- verify=False: 忽略 SSL 证书验证