# Crawl basic information about Wanfang papers (爬取万方论文数据基本信息)
# Code 1 (代码1)
import re
import time
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lxml import etree
from pandas import DataFrame
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# --- Search setup -----------------------------------------------------------
# Build the search URL from the keyword typed by the user, open it in a
# headless Chrome session, read the total number of hits from the result page
# and derive how many result pages to visit (capped at 10).

url_first = 'http://s.wanfangdata.com.cn/paper?q='
# Percent-encode the keyword so characters such as '&', '#', '+' or spaces
# stay part of the query value instead of altering the URL structure.
url = url_first + quote(input('请输入您想搜索的关键词:'))

# NOTE(review): `headers` is not used by the Selenium calls in this section;
# it is kept because a later part of the file may read it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'
}

# Run Chrome without a visible window.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
brower = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)
brower.get(url)

# Text of the page element that is expected to contain the total hit count.
all_content_counts_real = brower.find_element_by_xpath(
    '/html/body/div[3]/div/div[2]/div[2]/div/div[3]/div[2]/div[2]/span[2]'
).text

# Take the first run of ASCII digits in that text as the hit count.
count_match = re.search(r'[0-9]+', all_content_counts_real)
if count_match is None:
    # Same exception type that int('') raised before, but with a message
    # that shows what was actually scraped.
    raise ValueError(
        'could not find a result count in %r' % all_content_counts_real
    )
all_content_counts = count_match.group()

# Number of result pages to visit, never more than 10.
# NOTE(review): the divisor 20 is presumably the number of results per page,
# and floor division ignores a partial last page — confirm both are intended.
page_counts = min(int(all_content_counts) // 20, 10)

# XPath fragments: first_code + <item index> + one of the last_code_* suffixes
# addresses one part of a single entry in the result list.
first_code = '/html/body/div[3]/div/div[2]/div[2]/div/div[3]/div[2]/div[3]/div['
last_code_name = ']/div/div[1]/div[2]/a'
last_code_ele = ']/div/div[2]'
last_code_abs = ']/div/div[3]'

# Collected rows, filled by the scraping loop that follows.
database = []
for j in range(1,3):
if j <= 2:
for i in range(1, 21):
name_xpath = ('%s%d%s' % (first_code, i, last_code_name))
ele_xpath = ('%s%d%s' % (first_code, i, last_code_ele))
abs_xpath = ('%s%d%s' % (first_code, i, last_code_abs))
name = brower.find_element_by_xpath(name_xpath)
ele = brower.find_element_by_xpath(ele_xpath)
abs = brower.find_element_by_xpath(abs_xpath)
name_url = name.get_attribute("href")
print(name.text, ele.text, abs.text, name_url)
database.append([name.text, ele.text,