爬取新闻列表并发微信指定人/群
爬取新闻列表并发微信指定人/群
1. 目标地址:https://www.jiemian.com/lists/800.html
2. 获取新闻列表
2.1 requests方法
import time, requests, json, bs4
import pandas as pd
# Step 2.1: download the Jiemian list page with requests and print one
# "title link" line per news card.

# Desktop-Chrome User-Agent sent with every request made below.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'}
session = requests.session()
# requests waits indefinitely unless a timeout is given, so bound the wait.
res = session.get('https://www.jiemian.com/lists/800.html', headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page.
res.raise_for_status()
text = res.content.decode('utf-8')
soup = bs4.BeautifulSoup(text, 'lxml')
list_content = soup.find_all('li', {'class': 'card-list'})
for i in list_content:
    title_tag = i.find('h3', {'class': 'card-list__title'})
    link_tag = i.find('a', {'class': 'logStore'})
    # Skip cards that lack a title or a link rather than crashing the
    # whole run on the first incomplete <li>.
    if title_tag is None or link_tag is None or not link_tag.get('href'):
        continue
    a2 = title_tag.text        # headline text
    a3 = link_tag['href']      # article URL
    content_jm = a2 + ' ' + a3
    print(content_jm)
# After the loop, a2 / a3 hold the last complete card; the later steps of
# this script read them.
结果如下:
3. 获取新闻内容
# Step 3: download the article at a3 (the URL left over from the list
# loop) and collect its body text in article_content.

# Bound the wait: requests has no default timeout.
res_article = session.get(a3, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page.
res_article.raise_for_status()
text_article = res_article.content.decode('utf-8')
soup_article = bs4.BeautifulSoup(text_article, 'lxml')
article_body = soup_article.find('div', {'class': 'article-content'})
if article_body is None:
    # Name the missing element and the URL instead of a bare IndexError.
    raise RuntimeError('article-content div not found in ' + a3)
str_article_content = article_body.text
# Every line of the body text followed by a newline, built in one pass.
article_content = ''.join(line + '\n' for line in str_article_content.split('\n'))
结果如下:
2.2 webdriver方法
import os, time, datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
def isElementPresent(by, value):
    """Report whether the module-level ``driver`` can locate an element.

    Args:
        by: Locator strategy passed to ``driver.find_element``
            (for example ``By.XPATH``).
        value: Locator expression for that strategy.

    Returns:
        ``True`` when ``driver.find_element`` succeeds, ``False`` when it
        raises ``NoSuchElementException``. Any other exception propagates.
    """
    # Imported locally so the except clause can always resolve the name;
    # without it a missing element ends in NameError instead of False.
    from selenium.common.exceptions import NoSuchElementException

    try:
        driver.find_element(by, value=value)
    except NoSuchElementException:
        return False
    return True
# Step 2.2: open the Jiemian list page in Chrome, click through the first
# 16 entries, and save each unseen article's text and lead image under
# the "list" directory.

# Common prefix of every article-page XPath used below.
ARTICLE_XPATH = '/html/body/div[1]/div[3]/div[1]/div/div[2]/div[1]/div[1]'
# Deletes characters that Windows does not allow in file names, so a
# headline containing e.g. '?' or ':' can still be used as a file name.
_UNSAFE_FILENAME_CHARS = str.maketrans('', '', '\\/:*?"<>|')

# The output directory must exist before open() can create files in it.
os.makedirs('list', exist_ok=True)

# NOTE(review): bt_list is not defined anywhere in the visible file.
# It is rebuilt here from list.txt, the file this loop appends saved
# titles to -- presumably the intended source; confirm.
if os.path.exists('list.txt'):
    with open('list.txt', encoding='utf-8') as history:
        bt_list = [line.strip() for line in history if line.strip()]
else:
    bt_list = []

driver = webdriver.Chrome()
driver.get('https://www.jiemian.com/lists/800.html')
driver.maximize_window()

for i in range(1, 17):
    headline = driver.find_element(
        By.XPATH, '//*[@id="load-list"]/li[' + str(i) + ']/div[2]/div[1]/a/h3')
    bt = headline.text
    if bt in bt_list:
        continue  # already saved

    headline.click()
    time.sleep(1)
    # Work in the most recently opened window (the article).
    driver.switch_to.window(driver.window_handles[-1])

    # Reset per article so a page missing one of these elements neither
    # raises NameError nor reuses the previous article's value.
    zy = ''          # summary
    zw = ''          # body text
    img_link = None  # lead image URL

    # The article layout varies; when several candidates exist the later
    # one wins. If none exists, bt keeps the list-page title.
    for title_xpath in (ARTICLE_XPATH + '/div[3]/h1',
                        ARTICLE_XPATH + '/div[4]/h1'):
        if isElementPresent(By.XPATH, title_xpath):
            bt = driver.find_element(By.XPATH, title_xpath).text.replace('|', '')

    summary_xpath = ARTICLE_XPATH + '/div[4]/p'
    if isElementPresent(By.XPATH, summary_xpath):
        zy = driver.find_element(By.XPATH, summary_xpath).text

    for body_xpath in (ARTICLE_XPATH + '/div[6]/div[2]',
                       ARTICLE_XPATH + '/div[5]/div[2]'):
        if isElementPresent(By.XPATH, body_xpath):
            zw = driver.find_element(By.XPATH, body_xpath).text

    file_stem = os.path.join('list', bt.translate(_UNSAFE_FILENAME_CHARS))
    paragraphs = zw.split('\n')
    # Layout: summary, blank line, first body line, blank line, rest.
    with open(file_stem + '.txt', 'w', encoding='utf-8') as cont:
        cont.write(zy + '\n' + '\n' + paragraphs[0] + '\n' + '\n'
                   + '\n'.join(paragraphs[1:]))

    for img_xpath in (ARTICLE_XPATH + '/div[6]/div[1]/img',
                      ARTICLE_XPATH + '/div[5]/div[1]/img'):
        if isElementPresent(By.XPATH, img_xpath):
            img_link = driver.find_element(By.XPATH, img_xpath).get_attribute('src')

    # Download the image only when the page actually has one.
    if img_link:
        # Bound the wait: requests has no default timeout.
        r = requests.get(img_link, timeout=10)
        if r.ok:
            with open(file_stem + '.' + img_link.split('.')[-1], mode='wb') as f:
                f.write(r.content)

    # Record the saved title so later runs skip it.
    with open('list.txt', 'a', encoding='utf-8') as bt_list_open:
        bt_list_open.write('\n' + bt)

    # Close the article window and return to the list window.
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
4. 发送至微信指定联系人
# Final step: bring the WeChat desktop window forward, open the chat
# named "重点新闻" through the search box, type the headline, article text
# and link, send it, and log the message to content.txt.
#
# NOTE(review): WindowControl and keyboard are not imported anywhere in
# the visible file. The key syntax used here ('^f', '{ENTER}', '^~')
# matches pywinauto.keyboard.send_keys, which the escaping and the
# with_spaces argument below assume -- confirm and add the imports.

# Characters send_keys treats as modifiers/grouping instead of typing them.
_SEND_KEYS_RESERVED = '+^%~{}()'


def _literal_keys(text):
    """Wrap reserved send_keys characters in braces so they are typed as-is."""
    return ''.join('{' + ch + '}' if ch in _SEND_KEYS_RESERVED else ch
                   for ch in text)


wechat_window = WindowControl(searchDepth=1, Name="微信", ClassName="WeChatMainWndForPC")
wechat_window.SetActive()
time.sleep(1)
keyboard.send_keys('^f')          # Ctrl+F: focus the search box
keyboard.send_keys('重点新闻')     # name of the target contact/group
time.sleep(1)
keyboard.send_keys('{ENTER}')     # open the first search result
time.sleep(1)

message = a2 + '\n' + article_content + '链接:' + a3
wechat_window = WindowControl(searchDepth=1, Name="微信", ClassName="WeChatMainWndForPC")
wechat_window.SetActive()
time.sleep(1)
# Type each non-blank line literally and join lines with '^~' (Ctrl+Enter),
# the same in-message line break already used after the headline. Raw
# text would lose its spaces and have characters such as '~' or '('
# executed as keystrokes -- a bare '~' is Enter and would send early.
typed_lines = [_literal_keys(line) for line in message.split('\n') if line.strip()]
keyboard.send_keys('^~'.join(typed_lines), with_spaces=True)
keyboard.send_keys('{ENTER}')     # send the message

# NOTE(review): `content` is not defined in the visible file; presumably
# it holds the earlier contents of content.txt -- confirm.
# utf-8 matches the other text files this script writes and avoids
# encode errors under a non-UTF-8 locale default.
with open('content.txt', 'w', encoding='utf-8') as file:
    file.write(content + message + '\n')
time.sleep(1)