普通学生,想爬取那些无法获得cookie的网站数据,然后从其他文章那里学到如何使用Selenium,感觉很有用,就记录下来,防止以后忘记.
# for item in list_title:
# key = urllib.parse.quote(item)
# url = "https://so.eastmoney.com/news/s?keyword=" + str(key) # 对不同关键词的网址进行构造
###########################列表数据
import time

import openpyxl as op
from lxml import etree
from openpyxl import load_workbook
from selenium import webdriver
from selenium.webdriver.common.by import By
def create_excel(save_location):
    """Create a fresh Excel workbook with the header row for scraped records.

    The file is written to ``D:\\金融数据集_<save_location>.xlsx``. Columns are:
    序号 (sequence number), 时间 (date), 题目 (title), 内容 (content).

    :param save_location: suffix used to build the output file name
    :return: 0 on completion
    """
    workbook = op.Workbook()
    sheet = workbook.create_sheet(index=0)
    sheet.cell(row=1, column=1, value='序号')
    sheet.cell(row=1, column=2, value='时间')
    sheet.cell(row=1, column=3, value='题目')
    # Bug fix: this header was written to column=3 as well, clobbering '题目'
    # and leaving column 4 (where write_data puts the content) unlabeled.
    sheet.cell(row=1, column=4, value='内容')
    site = "D:\\金融数据集_" + save_location + ".xlsx"
    workbook.save(site)
    return 0
def write_data(result, seq, save_location):
    """Write one scraped record into row ``seq`` of the workbook.

    Opens the workbook created by ``create_excel``, fills columns 1-4 of the
    given row (sequence number, date, title, content), and saves it back,
    overwriting the file in place.

    NOTE(review): row index equals ``seq`` directly, so a caller starting at
    seq=1 would overwrite the header row — the crycle() caller increments seq
    before the first write, which avoids this only if it starts from seq >= 1.

    :param result: dict with keys 'date', 'title', 'content'
    :param seq: running record counter; also the target row number
    :param save_location: suffix used to build the workbook file name
    :return: 0 on completion
    """
    path = "D:\\金融数据集_" + save_location + ".xlsx"
    workbook = load_workbook(path)
    sheet = workbook.active
    row_values = (seq, result['date'], result['title'], result['content'])
    for column, value in enumerate(row_values, start=1):
        sheet.cell(row=seq, column=column).value = value
    print("数据" + str(seq) + "写入成功")
    workbook.save(path)
    return 0
def crycle(url, seq, save_location, pages=50):
    """Scrape a paginated eastmoney search-result listing into an Excel file.

    Opens the search URL in Chrome, then for each page reads up to 10 result
    items (date, title, content snippet) by absolute XPath, writes each record
    via ``write_data``, and clicks the "next page" link.

    :param url: search-result URL (keyword already encoded into it)
    :param seq: starting record counter; incremented before each write
    :param save_location: suffix for the output workbook file name
    :param pages: number of result pages to walk (default 50, as before)
    """
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # Dropped ``browser.encoding = "utf-8"``: WebDriver has no such
        # attribute, so the original line was a silent no-op.
        create_excel(save_location)
        for page in range(pages):
            print("第" + str(
                page + 1) + "页------------------------------------------------------------------------------------------")
            for i in range(10):
                # All items live under the same container; build the base once.
                base = "/html/body/div[1]/div[3]/div[1]/div[3]/div[" + str(i + 1) + "]"
                # find_element_by_xpath was removed in Selenium 4; use the
                # By.XPATH form instead.
                date = browser.find_element(By.XPATH, base + "/div[2]/span[1]").text
                title = browser.find_element(By.XPATH, base + "/div[1]/a").text
                content = browser.find_element(By.XPATH, base + "/div[2]/span[2]").text
                # Renamed from ``url`` — the original shadowed the parameter.
                # Currently unused beyond retrieval (kept for future drill-down).
                item_url = browser.find_element(By.XPATH, base + "/div[1]/a").get_attribute("href")
                time.sleep(1)
                result = {'date': date, 'title': title, 'content': content}
                seq += 1
                write_data(result, seq, save_location)
            # "Next page" link; full XPath copied from the browser inspector.
            browser.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[1]/div[4]/div/a[5]').click()
            time.sleep(3)
    finally:
        # Always release the browser process, even on a failed locate.
        browser.quit()
使用Selenium需要前置条件,不能直接使用,但是可以绕过网站的一些反爬虫程序,对于小数据收集很有效果.(十万以内)