# Use Selenium to request pages, BeautifulSoup to parse the HTML,
# urlretrieve to download images, and a delay to wait for the page's JS to load.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
from urllib.request import urlretrieve
import os,json,time,sched
from urllib.parse import urlparse,urljoin
from bs4 import BeautifulSoup
class GetSouGou:
    """Crawl Sogou WeChat ("weixin.sogou.com") search results.

    Fetches the result page with Selenium, parses it with lxml/BeautifulSoup,
    and saves each article's text plus its cover image under d:\\sougou\\<key>.
    """

    def __init__(self, key):
        """Start a Chrome driver and prepare the output folder for *key*.

        key: search keyword; also used as the per-query folder name.
        """
        self.path = "d:\\sougou"
        self.key = key
        self.url = "https://weixin.sogou.com/weixinwap?query={}&type={}&ie=utf8&_sug_=y&_sug_type_=&s_from=input"
        # Chrome options; headless flags kept for reference.
        self.options = Options()
        # self.options.add_argument('--headless')
        # self.options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=self.options)
        # Output folder: d:\sougou\<key>.
        # BUGFIX: the original called os.mkdir(path) with an undefined name
        # instead of self.path; makedirs creates both levels in one call.
        self.path = "{}\\{}".format(self.path, self.key)
        os.makedirs(self.path, exist_ok=True)

    def save_article(self, id):
        """Extract the <p> texts of the currently loaded article page
        (div#js_content) and dump them as JSON to <path>\\<id>.txt."""
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        content_div = soup.find(id="js_content")
        if content_div is None:
            # BUGFIX: the original raised AttributeError here when the page
            # had not finished redirecting; skip instead of crashing.
            print("js_content not found, skipping article", id)
            return
        article_content = [p.get_text() for p in content_div.find_all("p")]
        print(article_content)
        with open('{}\\{}.txt'.format(self.path, id), 'w', encoding='utf-8') as fObj:
            json.dump(article_content, fObj, ensure_ascii=False)

    def save_gzh(self, html=None):
        """Placeholder for saving official-account ("gzh") result pages.

        BUGFIX: the original signature lacked `self`; the default keeps the
        old no-argument bound call working.
        """
        pass

    @staticmethod
    def _first(nodes):
        """Return the first element of an xpath result list, or "" if empty."""
        return nodes[0] if nodes else ""

    def get_article_list(self, type):
        """Load the search result page for self.key and collect article
        metadata (id, title, summary, href, pic) into self.article_list,
        then persist the list to <path>\\first_page.txt.

        type: the search "type" query parameter (e.g. "2" for articles).
        """
        self.driver.get(self.url.format(self.key, type))
        # Strip the keyword-highlight markers so xpath text() sees one
        # continuous text node.
        html = (self.driver.page_source
                .replace("<em><!--red_beg-->", "")
                .replace("<!--red_end--></em>", ""))
        first_page = etree.HTML(html)
        base = "//*[@id='mainBody']/div[3]/div/div/ul/li"
        self.article_list = []
        for id in first_page.xpath(base + "/@id"):
            li = base + "[@id='{}']".format(id)
            item = {"id": id}
            item["title"] = self._first(first_page.xpath(li + "/div/h4/a/div/text()"))
            item["summary"] = self._first(first_page.xpath(li + "/div/a/p/text()"))
            # Two page layouts exist: a div.pic wrapper, or a nested div/div.
            item["href"] = self._first(first_page.xpath(li + "/div[@class='pic']/a/@href"))
            alt_href = first_page.xpath(li + "/div/div/a/@href")
            if alt_href:
                item["href"] = alt_href[0]
            item["pic"] = self._first(first_page.xpath(li + "/div[@class='pic']/a/img/@src"))
            alt_pic = first_page.xpath(li + "/div/div/a/img/@src")
            if alt_pic:
                item["pic"] = alt_pic[0]
            self.article_list.append(item)
        # Persist the collected metadata for this first result page.
        with open('{}\\first_page.txt'.format(self.path), 'w', encoding='utf-8') as fObj:
            json.dump(self.article_list, fObj, ensure_ascii=False)

    def download(self):
        """For every collected article: download its cover image and save
        the article body text via save_article()."""
        for item in self.article_list:
            id = item["id"]
            pic = item["pic"]
            # The img src looks like "...url(<image-url>"; keep the part
            # after "url(".  BUGFIX: the original raised ValueError on items
            # with an empty pic; skip the image download instead.
            if "url" in pic:
                pic = pic[pic.index("url") + 4:]
                # BUGFIX: the original hard-coded "d:\\sougou\\<key>";
                # reuse self.path so the two stay consistent.
                urlretrieve(pic, "{}\\{}.png".format(self.path, id))
            # Open the article link (a relative href on weixin.sogou.com).
            self.driver.get("https://weixin.sogou.com" + item["href"])
            # Give the page's JS redirect ~5 s to finish before scraping
            # (the sched-based one-shot timer was an indirect time.sleep).
            time.sleep(5)
            self.save_article(id)
            time.sleep(3)
    # self.driver.quit()
if __name__ == '__main__':
    # Scrape articles for the keyword "python" (type "2" = article search).
    gsg = GetSouGou("python")
    try:
        # get_article_list populates gsg.article_list in place (returns None);
        # the original bound its None result to an unused variable.
        gsg.get_article_list("2")
        gsg.download()
        # gsg.save_article(html)
    finally:
        # Always close the browser window, even if scraping raised.
        gsg.driver.close()