# -*- coding=utf-8 -*-
import time
import datetime
import requests
import re
import os
import random
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError, ReadTimeout
##
headers2 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
cookies = {
    'Cookie': 'ASP.NET_SessionId=04kvxfczofgsnqanxayw5twy; SID_navi=120162; cnkiUserKey=6c764210-5480-5645-eccc-02ec772592d7; Ecp_ClientId=1191218194104202263; Ecp_IpLoginFail=191218111.167.199.201; _pk_ses=*'}
param = {
    'Accept': 'text/html, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
    'Host': 'yuanjian.cnki.net',
    'Origin': 'http://navi.cnki.net',
    'Referer': 'http://navi.cnki.net/KNavi/JournalDetail?pcode=CJFD&pykm=YISY',
    'X-Requested-With': 'XMLHttpRequest'}
formdata = {'pykm': 'YISY',
            'pcode': 'CJFD',
            'pageIdx': '0',
            'type': '2'}
proxy = {
    'http': 'http://117.68.145.187:4528',
    'https': 'https://117.68.145.187:4528'
}
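# The proxy address above is only a sample and is most likely stale; swap in a
# live proxy of your own before running. A minimal, hedged sketch of a pre-flight
# check (checkProxy is a helper added here for illustration, not part of the
# original script; http://httpbin.org/ip simply echoes back the requesting IP):
def checkProxy(p):
    try:
        r = requests.get('http://httpbin.org/ip', proxies=p, timeout=5)
        print('proxy OK, exit IP:', r.json().get('origin'))
        return True
    except Exception as ex:
        print('proxy check failed:', ex)
        return False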
def getURL(url):  # send the HTTP request
    a = requests.post(url, data=formdata, headers=headers2, cookies=cookies, proxies=proxy)  # with the proxy
    # a = requests.post(url, data=formdata, headers=headers2, cookies=cookies)  # without the proxy
    # a = requests.post(url, data=formdata, headers=headers2, cookies=cookies, params=param)
    # a.encoding = 'utf-8'  # uncomment if the response text comes back garbled
    html = a.text
    return html
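# ConnectionError and ReadTimeout are imported at the top but never used; a
# minimal retry-wrapper sketch (getURLWithRetry is a hypothetical helper, not in
# the original script, and ReadTimeout only fires if requests.post is given a
# timeout argument):
def getURLWithRetry(url, retries=3, wait=2):
    for attempt in range(retries):
        try:
            return getURL(url)
        except (ConnectionError, ReadTimeout) as ex:
            print('request failed (%s), retry %d/%d' % (ex, attempt + 1, retries))
            time.sleep(wait)
    return ''  # give up and return an empty page after the last attempt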
def doDown():
    # read the URL list line by line
    with open("c.txt", 'r') as infile:
        for url in infile:
            url = url.strip()  # drop the trailing newline so the slices below line up
            if not url:
                continue
            try:
                # the last 10 characters of the URL encode year (4) / issue (2) / article no. (3)
                print(url[-10:-6], 'year, issue', url[-6:-4], ', article', url[-3:])
                print('current url:', url)
                # request the CNKI page and parse it
                soup = BeautifulSoup(getURL(url), 'html.parser')
                soup2 = soup.find('div', class_='wxBaseinfo').find_all('p')[0]
                title = soup.find('title').text                 # article title
                gaiyao = soup2.findAll('span')[0].contents[0]   # abstract
                with open("d.txt", 'a') as outfile:
                    # outfile.write('\n' + url[-10:-6] + ' year, issue ' + url[-6:-4] + ', article ' + url[-3:])
                    # outfile.write('\n' + 'title: ' + title)
                    outfile.write('\n' + 'abstract: ' + gaiyao)
                    outfile.write('\n')
                time.sleep(0.5)
            except Exception as ex:
                print("exception occurred: %s" % ex, url)
                continue
if __name__ == '__main__':
    doDown()
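# Usage notes (inferred from the code above, stated as assumptions rather than
# documented behaviour): c.txt is expected to hold one CNKI article URL per line,
# with the last 10 characters of each URL encoding year/issue/article number;
# extracted abstracts are appended to d.txt. The session cookie in `cookies`
# expires quickly, so replace it with a fresh one from your browser before running.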