最近由于自己研究需要,写了爬取华尔街日报的爬虫代码。核心是通过selenium并配置缓存文件进行抓取。
为了避免潜在的法律和版权风险,此贴仅供交流学习使用。
导入包
导入的有点多,有一些包是之前尝试时用到的,不一定都会用到,可以根据自己的需要选择导入。
关于selenium配置的相关问题,可以参看上篇文章。
import pyppeteer
import asyncio
import json
from pyppeteer import launch
import nest_asyncio
from pyppeteer.dialog import Dialog
from types import SimpleNamespace
from pyppeteer.connection import CDPSession
import time
from lxml import etree
import csv
import re
from tqdm import tqdm
import requests
import pandas as pd
import unicodedata
from string import punctuation
import requests
from requests.exceptions import TooManyRedirects
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from string import punctuation
nest_asyncio.apply()
#print(pyppeteer.__chromium_revision__) # 查看版本号
#print(pyppeteer.executablePath())
import os
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
import random
import threading
from threading import Thread
获取COOKIES的几种方法
第一种方法:通过pyppeteer。
# Method 1: obtain the cookies via pyppeteer.
async def main():
    """Log in to the WSJ with a headed Chromium, let the user manually accept
    the cookie banner (during the long sleeps), then dump the session cookies
    to a JSON text file for later requests-based crawling.
    """
    browser = await launch({'headless': False, 'args': ['--no-sandbox'], 'dumpio': True})
    try:
        page = await browser.newPage()
        await page.setViewport(viewport={'width': 1280, 'height': 800})
        await page.waitFor(2000)
        await page.goto('填入华尔街日报登入网址')
        # Two-step login form: username first, then the password page.
        await page.type('#username', "填入账号")
        await page.click("#basic-login > div:nth-child(1) > form > div:nth-child(2) > div:nth-child(6) > div.sign-in.hide-if-one-time-linking > button.solid-button.continue-submit.new-design > span")
        await page.waitFor(2000)
        await page.type('#password', '填入密码')
        await page.click("#password-login > div > form > div > div:nth-child(5) > div.sign-in.hide-if-one-time-linking > button")
        await page.waitFor(2000)
        await asyncio.sleep(30)
        # Mask the webdriver flag from the site's bot detection.
        await page.evaluate('''() =>{ Object.defineProperties(navigator,{ webdriver:{ get: () => false } }) }''')
        page2 = await browser.newPage()
        await page2.setViewport(viewport={'width': 1280, 'height': 600})
        await page2.waitFor(1000)
        await page2.goto('https://www.wsj.com/?mod=mh_header')
        await page2.waitFor(3000)
        await asyncio.sleep(60)
        # Manually click the pop-up dialog here and agree to the cookie collection.
        orcookies = await page2.cookies()
        print(orcookies)
        cookies = {item['name']: item['value'] for item in orcookies}
        with open("这里输入自己的路径.txt", "w") as f:
            f.write(json.dumps(cookies))
        await page2.evaluate('''() =>{ Object.defineProperties(navigator,{ webdriver:{ get: () => false } }) }''')
    finally:
        # FIX: the original never closed the browser, leaking the Chromium process.
        await browser.close()

asyncio.get_event_loop().run_until_complete(main())
第二种方法:通过webdriver。
# Method 2: obtain the cookies via selenium webdriver.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import json

browser = webdriver.Chrome(executable_path='/opt/anaconda3/bin/chromedriver')
# browser.add_argument('-headless')
un = '填入账号'
pw = '填入密码'
browser.get("填入华尔街日报登入网址")
# Step 1: enter the username and press "continue".
browser.find_element(By.XPATH, "//div/input[@name = 'username']").send_keys(un)
browser.find_element(By.XPATH, '//*[@id="basic-login"]/div[1]/form/div[2]/div[6]/div[1]/button[2]').click()
time.sleep(10)
# Step 2: enter the password and sign in.
browser.find_element(By.ID, "password-login-password").send_keys(pw)
browser.find_element(By.XPATH, '//*[@id="password-login"]/div/form/div/div[5]/div[1]/button').click()
time.sleep(10)
# Switch into the pop-up consent iframe:
# driver.switch_to_frame("sp_message_iframe_490357")
# Click to accept cookie collection:
# driver.find_element_by_xpath("//button[@title='YES, I AGREE']").click()
# time.sleep(5)
orcookies = browser.get_cookies()
print(orcookies)
cookies = {item['name']: item['value'] for item in orcookies}
with open("这里输入自己的路径.txt", "w") as f:
    f.write(json.dumps(cookies))
第三种方法:直接获取。
#可以直接用开发者程序或者插件获取cookies
#谷歌浏览器在需要获取Cookie的界面,按Ctrl+Shift+j打开js控制台
#输入 console.log(document.cookie) 回车打印Cookies
#定义一个可以直接复制网页cookies后清洗的函数
def cookie_clean(cookie):
    """Parse a raw ``document.cookie`` string (copied from the browser console)
    into a ``{name: value}`` dict.

    All whitespace is stripped from each ``name=value`` segment, matching the
    original behaviour. FIX: segments without an ``=`` (including the empty
    segment left by a trailing ``;``) are now skipped — the original's
    ``find('=') == -1`` sliced them into garbage entries.
    """
    cookies = {}
    for item in cookie.split(';'):
        item = ''.join(item.split())  # drop every whitespace character
        name, sep, value = item.partition('=')
        if sep:  # only keep well-formed name=value pairs
            cookies[name] = value
    return cookies
获取文章链接
# Fetch the article links from the WSJ archive, one listing page per date.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
           "content-type": "application/json; charset=UTF-8",
           "Connection": "keep-alive"
           }  # replace with your own request headers


def _parse_archive_page(soup):
    """Extract (articles, urls, categories) from one archive listing page.

    Titles and categories are ASCII-folded and stripped of embedded
    newlines/tabs, which would otherwise break the TSV output format.
    """
    headlines = soup.find_all('div', {'class': 'WSJTheme--headline--7VCzo7Ay'})
    urls = [d.a['href'] for d in headlines]
    articles = [d.text for d in headlines]
    articles = [unicodedata.normalize('NFD', t).encode('ascii', 'ignore').decode("utf-8").replace("\n", " ").replace('\t', "")
                for t in articles if len(t) >= 1]
    modu = soup.find_all('div', {'class': 'WSJTheme--overflow-hidden--qJmlzHgO'})
    categories = [d.find('div').text.strip().split("\t")[0] for d in modu]
    categories = [unicodedata.normalize('NFD', t).encode('ascii', 'ignore').decode("utf-8").replace("\n", "").replace('\t', "")
                  for t in categories if len(t) >= 0]
    return articles, urls, categories


def _save_links(articles, urls, categories, date):
    """Append one tab-separated record per article to the output file."""
    with open("这里输入自己的路径.txt", 'a') as j:
        for k, i in enumerate(categories):
            j.write(articles[k] + '\t' + urls[k] + '\t' + categories[k] + '\t' + date + '\n')


f = open("这是一个日期的列表文件的路径.txt", "r", encoding="cp936")
# Prepare a simple date-list file in advance (one date per line, e.g. 1998/01/01);
# how many dates it covers depends on your crawling needs.
lines = f.readlines()

# Hoisted out of the loop: the cookies and the HTTP session are invariant per
# run (the original re-read the cookie file and rebuilt the session each date).
with open("这是cookies的路径.txt", "r") as g:
    cookies = json.loads(g.read())
session = requests.session()

for line in lines[8596:]:  # if the crawl breaks, re-slice here and restart
    date = line.strip().split("\t")[0]
    url = "https://www.wsj.com/news/archive/" + date
    data = session.get(url, headers=headers, cookies=cookies)
    time.sleep(1)
    try:
        soup = BeautifulSoup(data.content, 'html.parser')
        articles, urls, categories = _parse_archive_page(soup)
        page_num = int(soup.find('span', {'class': "WSJTheme--pagepicker-total--Kl350I1l"}).text.strip().replace('of ', ''))
        _save_links(articles, urls, categories, date)
        if page_num == 1:
            print("pn=1")
        else:  # paginate through the remaining listing pages
            for pn in range(2, page_num + 1):
                print(pn)
                time.sleep(1)
                new_url = url + '?page=' + str(pn)
                data1 = session.get(new_url, headers=headers, cookies=cookies)
                time.sleep(1)
                soup1 = BeautifulSoup(data1.content, 'html.parser')
                articles1, urls1, categories1 = _parse_archive_page(soup1)
                _save_links(articles1, urls1, categories1, date)
    except Exception as e:  # record the failure and move on to the next date
        print(url, e)
        with open("这里输入自己的路径.txt", 'a') as l:
            # FIX: the original omitted the trailing newline, fusing error records.
            l.write(url + '\t' + date + '\n')
爬取文章内容
之前可以直接爬取网站的内容,不需要加载tmp文件。但似乎今年WSJ网站做了一些更新,所以代码要更复杂一些。
这里先呈现之前的代码。
# 旧的获取内容方法
def get_headers_and_cookies():
    """Return the fixed request headers and the saved WSJ cookies.

    The cookies are read from ``wsjcookies.txt``, a JSON object previously
    written by one of the cookie-harvesting methods above.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3542.0 Safari/537.36',
        "content-type": "application/json; charset=UTF-8",
        "Connection": "keep-alive",
    }
    with open("wsjcookies.txt", "r") as f:
        cookies = json.load(f)
    return headers, cookies
def open_file():
    """Read the pre-compiled article-link list (one record per line)."""
    # This is the file of already-collected article links.
    with open("文章链接路径.txt", "r", encoding="cp936") as f2:
        return f2.readlines()
def get_text(url, date, title, headers, cookies, session=None):
    """Download one article and save headline + body as a text file.

    On any failure (paywall, parse miss, 404) the title alone is written,
    so every link still leaves a file behind.

    FIX: the original referenced a global ``session`` that only ever existed
    as a local inside ``main()``, so every request raised NameError (silently
    swallowed by the broad except) and only titles were saved. ``session`` is
    now an optional parameter; a fresh one is created when omitted, keeping
    the old call signature working.
    """
    errorlog = []  # NOTE(review): never returned in the original either; kept for parity
    if session is None:
        session = requests.session()
    try:
        # requesting
        data = session.get(url, headers=headers, cookies=cookies)
        time.sleep(random.random())  # small jitter between requests
        soup = BeautifulSoup(data.content, 'html.parser')
        # parsing
        heads = soup.find('div', {'class': 'wsj-article-headline-wrap'}).text
        content = heads + soup.find('div', {'class': 'article-content'}).text
        # saving
        with open('输入存储路径/%s_%s.txt' % (date, title), 'w', encoding='utf-8', errors='ignore') as j:
            j.write(content)
    except Exception as e:
        print(url, e)
        # Fall back to saving just the title so the failure is visible on disk.
        with open('输入存储路径/%s_%s.txt' % (date, title), 'w', encoding='utf-8', errors='ignore') as j:
            j.write(title)
        errorlog.append([url, e])
def main():
    """Drive the old-style crawl: load the link list and fetch each article."""
    records = open_file()
    headers, cookies = get_headers_and_cookies()
    session = requests.session()  # created here but not handed to get_text (as in the original flow)
    for record in records:
        fields = record.split('\t')
        article_title = fields[0].replace('/', '-')
        article_url = fields[1]
        category = fields[2]
        article_date = fields[3].replace('\n', '').replace('/', '-')
        get_text(article_url, article_date, article_title, headers, cookies)


if __name__ == '__main__':
    main()
现在呈现更新版的代码。
# First load the Chrome profile directory (see the previous post for how to set it up).
option = webdriver.ChromeOptions()
option.add_argument(r"user-data-dir=配置文件路径/tmp")# load the profile folder; use command+shift+g to look up the path
driver = webdriver.Chrome(options=option)
class crawling():
    """Crawl WSJ article pages with a logged-in Selenium driver.

    Reads a tab-separated link list (title \t url \t category \t date), visits
    each article URL in the slice ``[num1:num2]`` and stores one text file per
    article under ``存储路径/<file>/``.
    """

    def __init__(self, driver, num1, num2, file):
        # Pre-compiled article list; one record per line.
        self.lines = open("这里是文章列表的文件.txt", "r", encoding="cp936").readlines()
        self.driver = driver
        self.num1 = num1    # start index of this worker's slice
        self.num2 = num2    # end index (exclusive)
        self.file = file    # sub-folder name (one per year/thread)

    def get_article(self):
        """Fetch the article at ``self.url`` and write it to disk.

        On any failure (text-free media page, 404, layout change) the title is
        written instead, so every link leaves a file behind.

        REFACTOR: the original duplicated the whole fetch/save logic in two
        nearly identical branches that differed only in filename truncation;
        unified here into a single path.
        """
        errorlog = []  # NOTE(review): kept for parity with the original; never read back
        driver = self.driver
        url, date, title = self.url, self.date, self.title
        newlink = url.split('/')[-1].split('.')[0].replace('?', "---")
        # Truncate over-long slugs so the target filename stays legal.
        truncated = len(newlink) > 200
        fname = newlink[:200] if truncated else newlink
        try:
            driver.get(url)
            time.sleep(8)  # give the page time to fully render
            titlename = driver.find_element(By.XPATH, '//*[@id="__next"]/div/main/div[2]/div[1]/div')
            writername = driver.find_element(By.XPATH, '//*[@id="__next"]/div/main/div[2]/article/div[2]/div[1]/div')
            article = driver.find_element(By.XPATH, '//*[@id="__next"]/div/main/div[2]/article/div[2]/section')
            content = titlename.text + '\n' + writername.text + '\n' + article.text
            if truncated:
                content += newlink  # keep the full slug inside the file when the name was cut
            time.sleep(1)
            with open('存储路径/' + self.file + '/%s_%s.txt' % (date, fname), 'w', encoding='utf-8', errors='ignore') as j:
                j.write(content)
        except Exception as e:
            fallback = title + '\n' + newlink if truncated else title
            with open('存储路径/' + self.file + '/%s_%s.txt' % (date, fname), 'w', encoding='utf-8', errors='ignore') as j:
                j.write(fallback)
            errorlog.append([url, e])

    def main(self):
        """Iterate over this worker's slice of the link list."""
        for line in self.lines[self.num1:self.num2]:
            linenew = line.split('\t')
            self.title = linenew[0].replace('/', '-')
            self.url = linenew[1]
            self.category = linenew[2]
            self.date = linenew[3].replace('\n', '').replace('/', '-')
            self.get_article()
# Multi-threaded crawling: each thread gets its own Chrome profile and driver.
def muti_crawling(path, n1, n2, file):
    """Start one Chrome instance on the given profile and crawl the slice [n1:n2]."""
    opts = webdriver.ChromeOptions()
    opts.add_argument(path)
    chrome = webdriver.Chrome(options=opts)
    worker = crawling(chrome, n1, n2, file)
    worker.main()
if __name__ == '__main__':
    # The first two numbers are the start/end positions in the link list; the last string labels the year.
    t1 = Thread(target=muti_crawling,args=(r'配置文件路径/tmp',0,45945,'1998'))
    t2 = Thread(target=muti_crawling,args=(r'配置文件路径/tmp2',45945,94758,'1999'))
    # Multi-threaded; adjust the slices/threads at any time.
    # Text-free news items and 404 pages cannot be crawled.
    # Each thread stores its results in a separate folder.
    t1.start()
    t2.start()
# Locate the split point: the index of the first record dated 2004-01-01.
li = []
lines = open("文章列表路径.txt", "r", encoding="cp936").readlines()
for line in lines:
    linenew = line.split('\t')
    date = linenew[3].replace('\n', '').replace('/', '-')
    li.append(date)
index = li.index('2004-01-01')
# FIX: the trailing bare `index` expression was a notebook remnant with no
# effect in a script; print it so the split point is actually reported.
print(index)
以上。