python 爬虫抓取网页数据导出excel_【python入门爬虫】爬取公众号搜索结果导出为Excel...

from selenium import webdriver

from selenium.webdriver.chrome.options import Options

import time

import json

import requests

from openpyxl import Workbook

def savecookies(cookies):
    """Persist the Selenium cookie list to ./cookies.json so a later run can reuse the login."""
    serialized = json.dumps(cookies)
    with open("./cookies.json", "w") as fp:
        fp.write(serialized)

def setcookies(driver):
    """Load cookies saved by savecookies() and install each one on *driver*.

    Returns the same driver for call chaining.
    """
    with open("./cookies.json", "r") as fp:
        stored = json.load(fp)
    for entry in stored:
        driver.add_cookie(cookie_dict=entry)
    return driver

def login(driver):

loginURL="https://mp.weixin.qq.com/";

driver.get(loginURL)

print("打开页面中...")

time.sleep(2)

#重新获取当前的url,当url不等于loginURL证明已经登录

if(str(driver.current_url).find("token=")>0):

print("已经登录")

else:

time.sleep(2)

#监听用户是否扫码登录,登录跳出循环

while driver.current_url == loginURL:

time.sleep( 0.5 )

print("----登录成功");

#保存cookies

savecookies(driver.get_cookies())

#获取token

driver.get("https://mp.weixin.qq.com");

nowURL = driver.current_url

start = str(nowURL).find("token=")+6

token = str(nowURL)[start:]

return token

def search(word, token, begin, count):
    """Search official accounts via the MP ``searchbiz`` endpoint.

    word:  query text (may be non-ASCII).
    token: session token returned by login().
    begin: zero-based offset for paging.
    count: page size requested.
    Returns the parsed JSON response as a dict (expected to contain 'list').
    """
    headers = {
        "HOST": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
    }
    # Rebuild a requests cookie jar from the cookies Selenium saved.
    cookies_dict = {}
    with open("./cookies.json", "r") as fp:
        cookies = json.load(fp)
    for cookie in cookies:
        # pgv_si / uuid are skipped — presumably session-bound values that
        # break the request when replayed (NOTE(review): confirm).
        if cookie["name"] in ("pgv_si", "uuid"):
            continue
        cookies_dict[cookie["name"]] = cookie["value"]
    cookies_jar = requests.utils.cookiejar_from_dict(cookies_dict, cookiejar=None, overwrite=True)
    session = requests.session()
    session.cookies = cookies_jar
    session.headers = headers
    # Fix over the original: pass the query through `params` so `word`
    # (e.g. Chinese text) is properly URL-encoded instead of concatenated raw.
    params = {
        "action": "search_biz",
        "begin": begin,
        "count": count,
        "query": word,
        "token": token,
        "lang": "zh_CN",
        "f": "json",
        "ajax": 1,
    }
    res = session.get("https://mp.weixin.qq.com/cgi-bin/searchbiz", params=params)
    print(res.text)
    return json.loads(res.text)

def save(datas, worksheet=None):
    """Append one spreadsheet row per account in *datas['list']*.

    datas:     parsed JSON dict from search(); each entry must carry
               fakeid / nickname / round_head_img / alias / service_type.
    worksheet: target with an ``append(row)`` method; defaults to the
               module-level ``sheet`` for backward compatibility.
    """
    target = sheet if worksheet is None else worksheet
    for item in datas['list']:
        row = [
            item['fakeid'],
            item['nickname'],
            item['round_head_img'],
            item['alias'],
            item['service_type'],
        ]
        target.append(row)

if __name__ == '__main__':
    # Create the output workbook; `sheet` stays module-global because save()
    # falls back to it. Fix over the original: use the public `book.active`
    # instead of the private `book._sheets[0]`.
    book = Workbook()
    sheet = book.active

    option = Options()
    # Uncomment to run Chrome without a visible window:
    # option.add_argument('--headless')
    # option.add_argument('--disable-gpu')
    # option.add_argument("window-size=1024,768")
    # option.add_argument("--no-sandbox")
    # Persist the browser profile so the QR login survives across runs.
    option.add_argument(r"user-data-dir=D:\WeChat")

    driver = webdriver.Chrome(options=option, executable_path="./chromedriver.exe")

    # Log in (QR scan on first run) and obtain the API token.
    token = login(driver)

    # Header row matching the fields save() writes.
    sheet.append(['fakeid', 'nickname', 'round_head_img', 'alias', 'service_type'])

    # Page through the search results until a short page signals the end.
    page_size = 5
    begin = 0
    while True:
        res = search("笔吧", token, begin, page_size)
        save(res)
        begin += page_size
        if len(res['list']) < page_size:
            break

    book.save("./WeChat.xlsx")
    driver.quit()

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值