Scraping Full-Text Articles from The Wall Street Journal (WSJ) with Python

This post describes how to use Python's Selenium and pyppeteer libraries to log in to The Wall Street Journal and capture the page cookies. With those cookies, protected article links can then be accessed. It walks through three different ways of obtaining the cookies, provides code for crawling and parsing article titles, links, and categories, and discusses how to handle the crawling challenges introduced by a site update.

I recently wrote a crawler for The Wall Street Journal for my own research needs. The core of it is scraping through Selenium with a pre-configured user-data (cache) directory.

To avoid potential legal and copyright risks, this post is for learning and exchange purposes only.

Importing packages

Quite a few packages are imported here; some are left over from earlier experiments and are not all strictly needed. Import whatever your own workflow requires.

For questions about configuring Selenium, see the previous post.

import pyppeteer
import asyncio
import json
from pyppeteer import launch
import nest_asyncio
from pyppeteer.dialog import Dialog
from types import SimpleNamespace
from pyppeteer.connection import CDPSession
import time
from lxml import etree
import csv
import re
from tqdm import tqdm
import requests
import pandas as pd
import unicodedata
from string import punctuation
from requests.exceptions import TooManyRedirects
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
nest_asyncio.apply()
#print(pyppeteer.__chromium_revision__)  # check the bundled Chromium revision
#print(pyppeteer.executablePath())       # check the Chromium executable path
import os
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
import random
import threading
from threading import Thread

Several ways to obtain the cookies

Method 1: via pyppeteer.

async def main():
    browser = await launch({'headless': False, 'args': ['--no-sandbox'], 'dumpio': True})
    page = await browser.newPage()
    await page.setViewport(viewport={'width': 1280, 'height': 800})
    
    await page.waitFor(2000) 
    await page.goto('WSJ login URL here')
    await page.type('#username', "your username here")
    await page.click("#basic-login > div:nth-child(1) > form > div:nth-child(2) > div:nth-child(6) > div.sign-in.hide-if-one-time-linking > button.solid-button.continue-submit.new-design > span")
    
    await page.waitFor(2000) 
    await page.type('#password', 'your password here')
    await page.click("#password-login > div > form > div > div:nth-child(5) > div.sign-in.hide-if-one-time-linking > button")
    await page.waitFor(2000) 
    await asyncio.sleep(30)
    await page.evaluate( '''() =>{ Object.defineProperties(navigator,{ webdriver:{ get: () => false } }) }''')
    
    
        
    page2 = await browser.newPage()
    await page2.setViewport(viewport={'width': 1280, 'height': 600})
    await page2.waitFor(1000)
    await page2.goto('https://www.wsj.com/?mod=mh_header')
    await page2.waitFor(3000) 
    await asyncio.sleep(60)
    
    # Manually click through the pop-up window and agree, so that cookies can be collected
    
    orcookies=await page2.cookies()
    print (orcookies)
    cookies = {}
    for item in orcookies:
        cookies[item['name']] = item['value']
    with open("这里输入自己的路径.txt", "w") as f:
        f.write(json.dumps(cookies))
  
    
    await page2.evaluate( '''() =>{ Object.defineProperties(navigator,{ webdriver:{ get: () => false } }) }''')
    
    
asyncio.get_event_loop().run_until_complete(main())
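
Once the cookie file has been written (by this method or either of the two below), a quick way to confirm that the cookies actually carry the login state is to reload them into requests and fetch the WSJ front page. A minimal sanity-check sketch, assuming the same placeholder file path; the User-Agent string is just an example:

import json
import requests

with open("your-path-here.txt", "r") as f:
    cookies = json.loads(f.read())

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
resp = requests.get("https://www.wsj.com/?mod=mh_header", headers=headers, cookies=cookies)
print(resp.status_code, resp.url)  # if the cookies work, the response should not bounce to a login page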

Method 2: via Selenium webdriver.

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import json

browser = webdriver.Chrome(executable_path = '/opt/anaconda3/bin/chromedriver')
# To run headless, create a ChromeOptions object, call options.add_argument('--headless'), and pass it via options=

un = 'your username here'
pw = 'your password here'
browser.get("WSJ login URL here")

browser.find_element(By.XPATH,"//div/input[@name = 'username']").send_keys(un)
browser.find_element(By.XPATH,'//*[@id="basic-login"]/div[1]/form/div[2]/div[6]/div[1]/button[2]').click()

time.sleep(10)

browser.find_element(By.ID,"password-login-password").send_keys(pw)
browser.find_element(By.XPATH,'//*[@id="password-login"]/div/form/div/div[5]/div[1]/button').click()

time.sleep(10)

# Switch to the pop-up iframe
# browser.switch_to.frame("sp_message_iframe_490357")
# Click to accept cookie collection
# browser.find_element(By.XPATH, "//button[@title='YES, I AGREE']").click()

# time.sleep(5)

orcookies = browser.get_cookies()
print(orcookies)
cookies = {}
for item in orcookies:
    cookies[item['name']] = item['value']
with open("这里输入自己的路径.txt", "w") as f:
    f.write(json.dumps(cookies))

Method 3: grab the cookies directly.

# You can also grab cookies directly with the browser developer tools or an extension
# In Chrome, on the page whose cookies you need, press Ctrl+Shift+J to open the JS console
# Type console.log(document.cookie) and press Enter to print the cookies
# Below is a function that cleans a cookie string copied straight from the page
def cookie_clean(cookie):
    names = []
    values = []
    for item in cookie.split(';'):
        item = ''.join(item.split())    # strip all whitespace from the name=value pair
        place = item.find('=')
        names.append(item[:place])
        values.append(item[place + 1:])
    return dict(zip(names, values))
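
For instance, with a (hypothetical, shortened) string copied from the console, the function returns a dict that requests can use directly:

# Hypothetical example; a real document.cookie dump is much longer
raw = "wsjregion=na%2Cus; ccpaApplies=true; usr_bkt=abc123"
cookies = cookie_clean(raw)
print(cookies)
# {'wsjregion': 'na%2Cus', 'ccpaApplies': 'true', 'usr_bkt': 'abc123'}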

Getting the article links

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
"content-type": "application/json; charset=UTF-8",
"Connection": "keep-alive"
} # replace with your own request headers

f= open("这是一个日期的列表文件的路径.txt", "r",encoding="cp936")
# 需要自己事先创建一个简单的日期列表文件,格式如:1998/01/01,具体囊括多少日期视爬取需求而定
line = f.readlines()

for line in line[8596:]: # 如果链接发生变化,随时切片再次爬取
    date = line.strip().split("\t")[0]
    with open("这是cookies的路径.txt", "r")as g:
        cookies = g.read()
        cookies = json.loads(cookies)
    session = requests.session()
    url = "https://www.wsj.com/news/archive/" + date
    data = session.get(url, headers=headers, cookies = cookies)
    
    time.sleep(1)
    
    try:
        soup = BeautifulSoup(data.content, 'html.parser') 
        urls = [i.a['href'] for i in soup.find_all('div', {'class':'WSJTheme--headline--7VCzo7Ay'})]
        articles = [i.text for i in soup.find_all('div', {'class':'WSJTheme--headline--7VCzo7Ay'})]
        articles = [unicodedata.normalize('NFD', i).encode('ascii', 'ignore').decode("utf-8").replace("\n"," ").replace('\t',"") for i in articles if len(i)>=1]
        modu=soup.find_all('div', {'class':'WSJTheme--overflow-hidden--qJmlzHgO'})
        categories = [i.find('div').text.strip().split("\t")[0] for i in modu]
        categories = [unicodedata.normalize('NFD', i).encode('ascii', 'ignore').decode("utf-8").replace("\n","").replace('\t',"") for i in categories if len(i)>=0]
        
        page_num = int(soup.find('span', {'class': "WSJTheme--pagepicker-total--Kl350I1l"}).text.strip().replace('of ', ''))
    
      
        with open("这里输入自己的路径.txt",'a') as j:
            for k, i in enumerate(categories):
                j.write(articles[k]+'\t'+ urls[k]+'\t'+categories[k]+'\t'+ date + '\n')     
                
        if page_num == 1: 
            print("pn=1")
    
        else: # paginate through the remaining pages
            for pn in range(2, page_num+1):
                print(pn)
                time.sleep(1) 
                new_url = url+'?page='+str(pn)
                data1 = session.get(new_url, headers=headers, cookies = cookies)
                time.sleep(1)
                soup1 = BeautifulSoup(data1.content, 'html.parser')
                urls1 = [i.a['href'] for i in soup1.find_all('div', {'class':'WSJTheme--headline--7VCzo7Ay'})]
                articles1 = [i.text for i in soup1.find_all('div', {'class':'WSJTheme--headline--7VCzo7Ay'})]
                articles1 = [unicodedata.normalize('NFD', i).encode('ascii', 'ignore').decode("utf-8").replace("\n"," ").replace('\t',"") for i in articles1 if len(i)>=1]
                modu1=soup1.find_all('div', {'class':'WSJTheme--overflow-hidden--qJmlzHgO'})
                categories1 = [i.find('div').text.strip().split("\t")[0] for i in modu1]
                categories1 = [unicodedata.normalize('NFD', i).encode('ascii', 'ignore').decode("utf-8").replace("\n","").replace('\t',"") for i in categories1 if len(i)>=0]
                        
                
                with open("这里输入自己的路径.txt",'a') as j:
                    for k, i in enumerate(categories1):
                        j.write(articles1[k]+'\t'+ urls1[k]+'\t'+categories1[k]+'\t'+ date + '\n')    
        
    except Exception as e: # log any errors
        print(url, e)
        with open("your-path-here.txt", 'a') as l:
            l.write(url + '\t' + date + '\n')
        pass
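
The date list file read at the start of this block can be generated in a few lines. A minimal sketch using pandas, with a hypothetical date range and the same placeholder path (adjust both to your needs):

import pandas as pd

# Hypothetical range; extend it to whatever span you want to crawl
dates = pd.date_range(start="1998-01-01", end="1999-12-31", freq="D")
with open("path-to-your-date-list.txt", "w") as f:
    for d in dates:
        f.write(d.strftime("%Y/%m/%d") + "\n")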
                        

Crawling the article content

Previously the article content could be scraped directly, without loading a tmp profile. But the WSJ site seems to have been updated this year, so the code has to be a bit more involved.

Here is the earlier code first.

# Old method for fetching article content

def get_headers_and_cookies():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3542.0 Safari/537.36',
    "content-type": "application/json; charset=UTF-8",
    "Connection": "keep-alive"
    }
    with open("wsjcookies.txt", "r")as f:
        cookies = f.read()
        cookies = json.loads(cookies)
    return headers,cookies

def open_file():
    # the article links collected and cleaned in the previous step
    f2 = open("path-to-article-links.txt", "r", encoding="cp936")
    lines = f2.readlines()
    return lines

def get_text(url, date, title, headers, cookies, session):
    errorlog = []
    try:
        # requesting
        data = session.get(url, headers=headers, cookies=cookies)
        time.sleep(random.random())
        soup = BeautifulSoup(data.content, 'html.parser') 
        # parsing
        heads = soup.find('div', {'class':'wsj-article-headline-wrap'}).text
        content = soup.find('div', {'class':'article-content'}).text
        content = heads + content
        # saving
        with open('your-storage-path/%s_%s.txt' % (date, title), 'w', encoding='utf-8', errors='ignore') as j:
            j.write(content)
    
    except Exception as e:
        print(url, e)
        with open('your-storage-path/%s_%s.txt' % (date, title), 'w', encoding='utf-8', errors='ignore') as j:
            j.write(title)
        errorlog.append([url, e])
        pass
    
def main():
    lines = open_file()
    headers,cookies= get_headers_and_cookies()
    session = requests.session()
    for line in lines:
        linenew = line.split('\t')
        title = linenew[0].replace('/','-')
        url = linenew[1]
        category = linenew[2]
        date = linenew[3].replace('\n','').replace('/','-')
        get_text(url, date, title, headers, cookies, session)
        
if __name__ == '__main__':
    main()

And here is the updated version.

# First load the Chrome profile; see the previous post for how to set it up
option = webdriver.ChromeOptions()
option.add_argument(r"user-data-dir=path-to-profile/tmp") # load the profile folder; on macOS, Command+Shift+G in Finder helps locate the path
driver = webdriver.Chrome(options=option)
class crawling():
    
    def __init__(self,driver,num1,num2,file):
        self.lines = open("path-to-article-list.txt", "r", encoding="cp936").readlines()
        self.driver = driver
        self.num1 = num1
        self.num2 = num2
        self.file = file
            
    def get_article(self):
        errorlog = []
        driver = self.driver
        url = self.url
        date = self.date
        title = self.title
        newlink = url.split('/')[-1].split('.')[0].replace('?',"---") 
        if len(newlink) <= 200:
            try:
                driver.get(url)
                time.sleep(8)
                titlename = driver.find_element(By.XPATH,'//*[@id="__next"]/div/main/div[2]/div[1]/div')
                writername = driver.find_element(By.XPATH,'//*[@id="__next"]/div/main/div[2]/article/div[2]/div[1]/div')
                article = driver.find_element(By.XPATH,'//*[@id="__next"]/div/main/div[2]/article/div[2]/section')
                content = titlename.text + '\n' + writername.text + '\n' + article.text
                time.sleep(1)
                with open('your-storage-path/' + self.file + '/%s_%s.txt' % (date, newlink), 'w', encoding='utf-8', errors='ignore') as j:
                    j.write(content)
            except Exception as e:
                #print(url, e)
                with open('your-storage-path/' + self.file + '/%s_%s.txt' % (date, newlink), 'w', encoding='utf-8', errors='ignore') as j:
                    j.write(title)
                errorlog.append([url, e])
                pass
            
        else:
            newlink2 = newlink[:200] # truncate to avoid over-long file names
            try:
                driver.get(url)
                time.sleep(8)
                titlename = driver.find_element(By.XPATH,'//*[@id="__next"]/div/main/div[2]/div[1]/div')
                writername = driver.find_element(By.XPATH,'//*[@id="__next"]/div/main/div[2]/article/div[2]/div[1]/div')
                article = driver.find_element(By.XPATH,'//*[@id="__next"]/div/main/div[2]/article/div[2]/section')
                content = titlename.text + '\n' + writername.text + '\n' + article.text + newlink
                time.sleep(1)
                with open('your-storage-path/' + self.file + '/%s_%s.txt' % (date, newlink2), 'w', encoding='utf-8', errors='ignore') as j:
                    j.write(content)
            except Exception as e:
                title2 = title + '\n' + newlink
                #print(url, e)
                with open('your-storage-path/' + self.file + '/%s_%s.txt' % (date, newlink2), 'w', encoding='utf-8', errors='ignore') as j:
                    j.write(title2)
                errorlog.append([url, e])
                pass
            
    def main(self):
        for line in self.lines[self.num1:self.num2]:
            linenew = line.split('\t')
            self.title = linenew[0].replace('/','-')
            self.url = linenew[1]
            self.category = linenew[2]
            self.date = linenew[3].replace('\n','').replace('/','-')
            self.get_article()
# Run with multiple threads

def muti_crawling(path, n1, n2, file):
    option = webdriver.ChromeOptions()
    option.add_argument("user-data-dir=" + path) # each thread needs its own profile directory
    driver = webdriver.Chrome(options=option)
    a = crawling(driver,n1,n2,file)
    a.main()

if __name__ == '__main__':
    t1 = Thread(target=muti_crawling, args=(r'path-to-profile/tmp', 0, 45945, '1998')) # the first two numbers are the start/end positions in the link list; the last argument is the year
    t2 = Thread(target=muti_crawling, args=(r'path-to-profile/tmp2', 45945, 94758, '1999'))

    # Multiple threads; adjust the ranges as needed
    # Items with no text and 404 pages cannot be crawled
    # Each thread saves to its own folder
    t1.start()
    t2.start()
# Locate the split point for a given year in the article list
li = []
lines = open("path-to-article-list.txt", "r", encoding="cp936").readlines()
for line in lines:
    linenew = line.split('\t')
    date = linenew[3].replace('\n','').replace('/','-')
    li.append(date)
index = li.index('2004-01-01')
index
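
The index found this way marks where that year begins in the article list, so it can be used directly as a slice boundary for muti_crawling above. A hypothetical example, adding a third thread that covers everything from 2004-01-01 onward (the profile path is a placeholder):

# Hypothetical: a third thread for articles from 2004-01-01 to the end of the list
t3 = Thread(target=muti_crawling, args=(r'path-to-profile/tmp3', index, len(lines), '2004'))
t3.start()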

That's all.
