爬取X-MOL,代码如下
把下面代码中的 name 和 password 替换成你自己的账号和密码,希望大佬指正
```python
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urlencode
from selenium import webdriver
import time
#导入动作链 ActionChains
from selenium.webdriver import ActionChains
import time
def se_login():
    """Log in to x-mol.com with a headless Firefox and capture the session.

    Side effects: sets the module-level globals `cookie` (list of cookie
    dicts from Selenium) and `cookiestr` (single "k=v;k=v;" string usable
    as a Cookie request header). The browser is always closed, even when
    a step fails (the original leaked the Firefox process on error).
    """
    options = webdriver.FirefoxOptions()
    # Headless mode: no visible browser window.
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.binary_location = "D://火狐浏览器//firefox.exe"
    browser = webdriver.Firefox(executable_path='E://Python37//geckodriver.exe',
                                options=options)
    try:
        browser.get('https://www.x-mol.com/login')
        # NOTE(review): 'name' / 'password' are placeholders — substitute
        # real credentials before running.
        browser.find_element_by_id('username').send_keys('name')
        browser.find_element_by_id('password').send_keys('password')
        btn = browser.find_element_by_xpath(
            '/html/body/div[2]/div[2]/div/div/div[2]/div[3]/form/div[4]/div/input')
        btn.click()
        # Give the login round-trip time to finish: reading cookies
        # immediately after click() can capture the pre-login session.
        time.sleep(2)
        global cookie, cookiestr
        cookie = browser.get_cookies()
        # Flatten the cookie dicts into one Cookie-header string.
        cookiestr = ''.join(item['name'] + '=' + item['value'] + ';'
                            for item in cookie)
    finally:
        browser.quit()  # always release the browser process
# Simulated login was not reliable, so content is fetched from the search
# page directly using the captured cookies.
base_url = "https://www.x-mol.com/paper/search/q?"
def get_text(search, pageindex):
    """Fetch one page of x-mol paper-search results.

    Parameters
    ----------
    search : str
        Query text for the `option` query parameter.
    pageindex : int
        1-based result-page number for the `pageIndex` parameter.

    Returns
    -------
    str or None
        The HTML body on HTTP 200; None on any other status or on a
        network error (the original returned None implicitly).

    Relies on the module-level globals `base_url` and `cookiestr`
    (populated by se_login()).
    """
    data = {'option': search, 'pageIndex': pageindex}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
        'Referer': 'https://www.x-mol.com/',
        'Host': 'www.x-mol.com',
        'Cookie': cookiestr
    }
    url = base_url + urlencode(data)
    try:
        r = requests.get(url, headers=headers)
    except requests.RequestException as e:  # narrow: network errors only
        print(str(e))
        return None
    if r.status_code == 200:
        print('\n', '要检索的网址为:', r.url)
        time.sleep(2)  # throttle consecutive requests
        print(r.status_code)
        return r.text
    return None
def ana_page(html):
    """Parse one search-result page and yield one dict per paper.

    Parameters
    ----------
    html : str
        HTML of a result page, as returned by get_text().

    Yields
    ------
    dict
        Keys: title, date, periodical, IF, DOI, author (all strings).
    """
    soup = BeautifulSoup(html, 'lxml')
    items = soup.find_all(name='div',
                          attrs={'class': 'magazine-senior-search-results-list-right'})
    for item in items:
        # .text gathers the text of all descendants; .string would return
        # None whenever a tag has more than one child node.
        title = item.find(name='div',
                          attrs={'class': 'it-bold space-bottom-m10'}).text.strip()
        periodical = item.find(name='em', attrs={'class': 'it-blue'}).text
        # The orange span holds the impact factor. (Was a needless
        # `global IF` — nothing reads the global; callers use the dict.)
        if_span = item.find(name='span', attrs={'style': 'color: #FF7010;'})
        impact_factor = if_span.text
        author = item.find_all(name='div',
                               attrs={'class': 'div-text-line-one it-new-gary'})[1].text.strip()
        # The text node right after the IF span carries "<date-label>:
        # <date>, <DOI-label>:<doi>". (Renamed from `all`, which shadowed
        # the builtin.)
        meta = if_span.next_sibling.string
        date_part, doi_part = meta.split(',')
        yield {
            'title': title,
            'date': date_part.split(':')[1].strip(),
            'periodical': periodical,
            'IF': impact_factor,
            'DOI': doi_part.split(':')[1].strip(),
            'author': author
        }
def main():
    """Fetch `Range` pages of results for `search` and print each record.

    Reads the module-level globals set by the interactive loop:
    `search` (query text), `Range` (page count as str) and `judge`
    ('y' means: only print papers with IF >= 10.0).

    The two original near-identical loops are merged; the only difference
    was the IF filter, which is now a single condition.
    """
    only_high_if = (judge == 'y')
    for pageindex in range(1, int(Range) + 1):
        html = get_text(search, pageindex)
        if html is None:
            continue  # request failed or non-200; skip this page
        for record in ana_page(html):
            if not only_high_if or float(record['IF']) >= 10.0:
                print(record)
# Entry point: log in once to capture cookies, then prompt for queries in
# a loop. Guarded so importing this module has no side effects.
if __name__ == '__main__':
    se_login()
    while True:
        search = input('请输入要搜索的内容:')
        Range = input('请输入检索页面数量:')
        judge = input('是否只输出IF>= 10的文献:')
        main()