python pandas教程百家号_python爬取百家号文章

#!/usr/bin/env python

# -*- coding:utf-8 -*-

import xlwt

from selenium import webdriver

# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import time

import requests

import hashlib

from lxml import etree

class Bjh():
    """Scrape the article and video feeds of one page into an ``.xls`` workbook.

    The class drives Chrome (mobile emulation) through Selenium, scrolls each
    feed until the page's ``s-loader`` element reports the state ``"2"``,
    parses the rendered HTML with lxml and writes one sheet per feed with
    xlwt.  Judging by the class name the target is a Baijiahao author page;
    every XPath below is tied to that page's markup at the time of writing.

    NOTE(review): ``executable_path=`` / ``chrome_options=`` and the
    ``find_element(s)_by_*`` helpers are Selenium 3 call forms that newer
    Selenium releases dropped -- confirm the pinned Selenium version before
    upgrading.
    """

    # Defaults preserved from the original hard-coded values.
    DEFAULT_DRIVER_PATH = "chromedriver"
    DEFAULT_USER_DATA_DIR = r"C:\Users\redhat\AppData\Local\Google\Chrome\User Data"

    # Feed container and the candidate XPaths (tried in order) for each field.
    _ARTICLE_ITEMS = '//*[@id="article"]/div/div/div'
    _ARTICLE_TITLE = (
        'div/div/div/div/div/div/div[2]/div[1]/text()',
        'div/div/div/div/div[1]/text()',
    )
    _ARTICLE_READ = (
        'div/div/div/div/div[3]/span/text()',
        'div/div/div/div/div/div/div[2]/div[2]/span/text()',
    )
    _VIDEO_ITEMS = '//*[@id="video"]/div/div/div'
    _VIDEO_TITLE = (
        'div/div/div[1]/div[2]/div/div[1]/text()',
        'div/div/div/div/div[1]/text()',
    )
    _VIDEO_READ = (
        'div/div/div[1]/div[2]/div/div[3]/span/text()',
        'div/div/div/div/div/div/div[2]/div[2]/span/text()',
    )

    def __init__(self, driver_path=DEFAULT_DRIVER_PATH,
                 user_data_dir=DEFAULT_USER_DATA_DIR):
        """Create the output workbook and start a Chrome session.

        Args:
            driver_path: Path (or command name) of the chromedriver binary.
            user_data_dir: Chrome profile directory to reuse.  Pass ``None``
                to start Chrome without a ``user-data-dir`` argument.
        """
        self.wb = xlwt.Workbook()

        chrome_options = webdriver.ChromeOptions()
        mobile_emulation = {'deviceName': 'iPhone 6/7/8 Plus'}
        chrome_options.add_experimental_option('mobileEmulation', mobile_emulation)
        if user_data_dir is not None:
            chrome_options.add_argument("user-data-dir=" + user_data_dir)

        self.driver = webdriver.Chrome(executable_path=driver_path,
                                       chrome_options=chrome_options)

    def open(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url=url)

    def hua(self):
        """Pause one second, then scroll to the bottom of the page."""
        time.sleep(1)
        print("滑动加载中")
        self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")

    @staticmethod
    def _parse_state(inner_html):
        """Return the loader state character embedded in ``inner_html``.

        The state is the third character from the end of the SECOND line of
        the markup (an empty string when that line is shorter than three
        characters).

        Raises:
            IndexError: if the markup contains no newline, i.e. there is no
                second line to inspect.
        """
        return str(inner_html).split("\n")[1][-3:-2]

    def check_state(self):
        """Return the state of the first ``s-loader`` element on the page."""
        loader = self.driver.find_element_by_class_name("s-loader")
        return self._parse_state(loader.get_attribute('innerHTML'))

    def check_state_video(self):
        """Return the state of the second ``s-loader`` element on the page."""
        loader = self.driver.find_elements_by_class_name("s-loader")[1]
        return self._parse_state(loader.get_attribute('innerHTML'))

    @staticmethod
    def _first_match(node, xpaths):
        """Evaluate ``xpaths`` on ``node`` in order; return the first non-empty result.

        Returns the (empty) result of the last expression when none matches.
        """
        found = []
        for expr in xpaths:
            found = node.xpath(expr)
            if found:
                break
        return found

    def _collect_rows(self, items_xpath, title_xpaths, read_xpaths):
        """Parse the current page and return one ``title + read`` list per feed item."""
        tree = etree.HTML(self.driver.page_source)
        rows = []
        for item in tree.xpath(items_xpath):
            title = self._first_match(item, title_xpaths)
            read = self._first_match(item, read_xpaths)
            rows.append(title + read)
        return rows

    def get_article(self):
        """Return the rows (lists of text nodes) scraped from the article feed."""
        return self._collect_rows(self._ARTICLE_ITEMS,
                                  self._ARTICLE_TITLE,
                                  self._ARTICLE_READ)

    def tab_video(self):
        """Click the tab that switches the page to the video feed."""
        ele = self.driver.find_element_by_xpath(
            '//*[@id="app"]/div/div[3]/div/div[1]/div/div/div[4]')
        ele.click()

    def get_video(self):
        """Return the rows (lists of text nodes) scraped from the video feed."""
        return self._collect_rows(self._VIDEO_ITEMS,
                                  self._VIDEO_TITLE,
                                  self._VIDEO_READ)

    @staticmethod
    def _row_cells(row, kind):
        """Return the leading cells of ``row`` that belong in the sheet.

        Article sheets hold up to four columns, every other sheet up to
        three.  Rows shorter than that are returned as-is instead of being
        indexed past their end.
        """
        limit = 4 if kind == "article" else 3
        return list(row[:limit])

    def write(self, title, type, data):
        """Add a sheet named ``type`` to the workbook and fill it from ``data``.

        Args:
            title: Unused here; kept so existing callers keep working.
            type: Sheet name; ``"article"`` sheets get a fourth column.
            data: Iterable of rows as produced by ``get_article`` /
                ``get_video``.  Empty rows are skipped and do not consume a
                sheet row.
        """
        ws = self.wb.add_sheet(type)
        index = 0
        for row in data:
            if not row:
                continue
            # Write only the cells that exist: the scraped row can be shorter
            # than the expected column count when an XPath found fewer nodes.
            for col, value in enumerate(self._row_cells(row, type)):
                ws.write(index, col, value)
            index += 1

    def _scroll_until_loaded(self, read_state):
        """Keep scrolling until ``read_state()`` returns ``"2"``.

        ``"2"`` is the value the original loop stopped on; presumably it is
        the loader's "no more content" marker -- verify against the page.
        """
        while True:
            self.hua()
            if read_state() == "2":
                break

    def run(self, url, title):
        """Scrape both feeds of ``url`` and save them to ``title + ".xls"``.

        The browser session is always shut down, even when a step fails; the
        workbook is only saved when both feeds were scraped successfully.
        """
        try:
            self.open(url)
            self._scroll_until_loaded(self.check_state)
            self.write(title, "article", self.get_article())

            # Reload three times (pausing 1s, 1s, 3s) before switching tabs,
            # exactly as the original sequence did.
            for pause in (1, 1, 3):
                self.driver.refresh()
                time.sleep(pause)

            # Retry the tab click until it succeeds; each failure is printed
            # and the page is reloaded before the next attempt.
            while True:
                try:
                    time.sleep(1)
                    self.tab_video()
                    break
                except Exception as e:
                    print(e)
                    self.driver.refresh()

            self._scroll_until_loaded(self.check_state_video)
            self.write(title, "video", self.get_video())
            self.wb.save(title + ".xls")
        finally:
            # quit() ends the whole session (all windows and the driver
            # process), so a separate close() is not needed.
            self.driver.quit()

if __name__ == '__main__':
    # Interactive entry point: ask for the output title first, then the page
    # URL (input() already returns str, so no extra conversion is needed),
    # and run one full scrape.
    title = input("请输入标题:")
    url = input("请输入url:")

    bjh = Bjh()
    bjh.run(url, title)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值