# Use Selenium to request pages, BeautifulSoup to parse the HTML,
# urlretrieve to download images, and a delay to wait for the page's JS to load.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
from urllib.request import urlretrieve
import os,json,time,sched
from urllib.parse import urlparse,urljoin
from bs4 import BeautifulSoup
class GetSouGou:
    """Crawl Sogou WeChat ("weixin.sogou.com") search results.

    Fetches the result page with Selenium, parses it with lxml/BeautifulSoup,
    and saves each article's text plus its cover image under d:\\sougou\\<key>.
    """

    def __init__(self, key):
        """Start a Chrome driver and prepare the output folder for *key*.

        key: search keyword; also used as the per-query folder name.
        """
        self.path = "d:\\sougou"
        self.key = key
        self.url = "https://weixin.sogou.com/weixinwap?query={}&type={}&ie=utf8&_sug_=y&_sug_type_=&s_from=input"
        # Chrome options; headless flags kept for reference.
        self.options = Options()
        # self.options.add_argument('--headless')
        # self.options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=self.options)
        # Output folder: d:\sougou\<key>.
        # BUGFIX: the original called os.mkdir(path) with an undefined name
        # instead of self.path; makedirs creates both levels in one call.
        self.path = "{}\\{}".format(self.path, self.key)
        os.makedirs(self.path, exist_ok=True)

    def save_article(self, id):
        """Extract the <p> texts of the currently loaded article page
        (div#js_content) and dump them as JSON to <path>\\<id>.txt."""
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        content_div = soup.find(id="js_content")
        if content_div is None:
            # BUGFIX: the original raised AttributeError here when the page
            # had not finished redirecting; skip instead of crashing.
            print("js_content not found, skipping article", id)
            return
        article_content = [p.get_text() for p in content_div.find_all("p")]
        print(article_content)
        with open('{}\\{}.txt'.format(self.path, id), 'w', encoding='utf-8') as fObj:
            json.dump(article_content, fObj, ensure_ascii=False)

    def save_gzh(self, html=None):
        """Placeholder for saving official-account ("gzh") result pages.

        BUGFIX: the original signature lacked `self`; the default keeps the
        old no-argument bound call working.
        """
        pass

    @staticmethod
    def _first(nodes):
        """Return the first element of an xpath result list, or "" if empty."""
        return nodes[0] if nodes else ""

    def get_article_list(self, type):
        """Load the search result page for self.key and collect article
        metadata (id, title, summary, href, pic) into self.article_list,
        then persist the list to <path>\\first_page.txt.

        type: the search "type" query parameter (e.g. "2" for articles).
        """
        self.driver.get(self.url.format(self.key, type))
        # Strip the keyword-highlight markers so xpath text() sees one
        # continuous text node.
        html = (self.driver.page_source
                .replace("<em><!--red_beg-->", "")
                .replace("<!--red_end--></em>", ""))
        first_page = etree.HTML(html)
        base = "//*[@id='mainBody']/div[3]/div/div/ul/li"
        self.article_list = []
        for id in first_page.xpath(base + "/@id"):
            li = base + "[@id='{}']".format(id)
            item = {"id": id}
            item["title"] = self._first(first_page.xpath(li + "/div/h4/a/div/text()"))
            item["summary"] = self._first(first_page.xpath(li + "/div/a/p/text()"))
            # Two page layouts exist: a div.pic wrapper, or a nested div/div.
            item["href"] = self._first(first_page.xpath(li + "/div[@class='pic']/a/@href"))
            alt_href = first_page.xpath(li + "/div/div/a/@href")
            if alt_href:
                item["href"] = alt_href[0]
            item["pic"] = self._first(first_page.xpath(li + "/div[@class='pic']/a/img/@src"))
            alt_pic = first_page.xpath(li + "/div/div/a/img/@src")
            if alt_pic:
                item["pic"] = alt_pic[0]
            self.article_list.append(item)
        # Persist the collected metadata for this first result page.
        with open('{}\\first_page.txt'.format(self.path), 'w', encoding='utf-8') as fObj:
            json.dump(self.article_list, fObj, ensure_ascii=False)

    def download(self):
        """For every collected article: download its cover image and save
        the article body text via save_article()."""
        for item in self.article_list:
            id = item["id"]
            pic = item["pic"]
            # The img src looks like "...url(<image-url>"; keep the part
            # after "url(".  BUGFIX: the original raised ValueError on items
            # with an empty pic; skip the image download instead.
            if "url" in pic:
                pic = pic[pic.index("url") + 4:]
                # BUGFIX: the original hard-coded "d:\\sougou\\<key>";
                # reuse self.path so the two stay consistent.
                urlretrieve(pic, "{}\\{}.png".format(self.path, id))
            # Open the article link (a relative href on weixin.sogou.com).
            self.driver.get("https://weixin.sogou.com" + item["href"])
            # Give the page's JS redirect ~5 s to finish before scraping
            # (the sched-based one-shot timer was an indirect time.sleep).
            time.sleep(5)
            self.save_article(id)
            time.sleep(3)
    # self.driver.quit()
if __name__ == '__main__':
    # Scrape articles for the keyword "python" (type "2" = article search).
    gsg = GetSouGou("python")
    try:
        # get_article_list populates gsg.article_list in place (returns None);
        # the original bound its None result to an unused variable.
        gsg.get_article_list("2")
        gsg.download()
        # gsg.save_article(html)
    finally:
        # Always close the browser window, even if scraping raised.
        gsg.driver.close()