使用 Python 分析 json 文件,爬取今日头条的文章并进行处理。
如今 python 爬虫的方式可以使用 BeautifulSoup,当遇到有动态的 js 文件时,可以使用 Selenium 进行模拟操作,但是使用这些方法来处理今日头条的搜索链接时毫无作用。我选择的方法其实也是一个绕弯的方法:先将 json 文件本地化,再进行 json 文件中数据的分析。
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
#from selenium import webdriver
import webbrowser
import random
import requests #用来抓取网页的html源代码
import socket
import os
import json
from bs4 import BeautifulSoup #用来代替正则表达式取源码中相应标签的内容
import time
import http.client #用做异常处理
import tkinter as tk
import win32clipboard as w
# Shared list of article URLs harvested from the downloaded JSON search
# results; filled by geturls() and consumed by start()/get_p_text().
urls=[]
def get_html(url,encodeing):
    """Download a URL while presenting desktop-browser request headers.

    Retries with a randomized back-off on transient network errors so a
    temporary hiccup does not abort the whole crawl.

    Args:
        url: Address to download.
        encodeing: Character encoding used to decode the response body
            (parameter name kept as-is for backward compatibility; it is
            a typo of "encoding").

    Returns:
        The decoded response text, or "" when every retry attempt failed.
    """
    header={
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
    }
    # Randomized timeout so the request timing looks less machine-generated.
    timeout = random.choice(range(80, 180))
    # Bounded retries: the original looped forever and only caught raw
    # socket/http.client exceptions, but requests wraps those in its own
    # RequestException hierarchy, so the handlers never fired and an
    # unassigned rep ("") would have crashed on rep.text.
    for _attempt in range(5):
        try:
            rep = requests.get(url, headers=header, timeout=timeout)
            rep.encoding = encodeing
            return rep.text
        except socket.timeout as e:
            print("3:", e)
            time.sleep(random.choice(range(8, 15)))
        except socket.error as e:
            print("4:", e)
            time.sleep(random.choice(range(20, 60)))
        except http.client.BadStatusLine as e:
            print("5:", e)
            time.sleep(random.choice(range(30, 80)))
        except http.client.IncompleteRead as e:
            print("6:", e)
            time.sleep(random.choice(range(5, 15)))
        except requests.exceptions.RequestException as e:
            # Covers ConnectTimeout, ReadTimeout, ConnectionError, etc.
            print("request error:", e)
            time.sleep(random.choice(range(5, 15)))
    return ""
def get_p_text(urls):
    """Collect colour-highlighted <p> paragraphs from the given article pages.

    Each page is downloaded, its <p> tags are scanned, and any paragraph
    containing one of the interesting car-review keywords is wrapped in a
    coloured <p> element.  The growing HTML is flushed to disk through
    write_in_browser() after every addition so partial results are visible.

    Args:
        urls: Iterable of article URLs to scrape.

    Returns:
        The accumulated HTML string.
    """
    # Keyword -> CSS colour, tested in priority order.  The "not in RES"
    # guard below deduplicates: once a paragraph is added, no later
    # keyword (or page) can add the same text again — this reproduces the
    # original chain of independent if-statements exactly.
    keyword_colors = [
        ('新车尾部', 'red'),
        ('外观', 'black'),
        ('动力', 'green'),
        ('配置', 'red'),
        ('侧面', 'blue'),
    ]
    RES = ''
    for url in urls:
        print(url)
        body = get_html(url, 'utf-8')
        bs = BeautifulSoup(body, "html.parser")  # parse the page
        body = bs.body  # restrict the search to the document body
        car_body_p = body.find_all("p")
        print(car_body_p)
        for tag in car_body_p:
            # get_text() walks the subtree; call it once per paragraph
            # instead of up to ten times as the original did.
            text = tag.get_text()
            for keyword, color in keyword_colors:
                if keyword in text and text not in RES:
                    RES += '<p style="color:' + color + ';">' + text + '</p>' + '\n'
                    write_in_browser(RES)
    return RES
def geturls():
    """Wait for the two downloaded search-result JSON files, then harvest URLs.

    Polls once per second until both "下载.json" and "下载 (1).json" exist
    in the hard-coded desktop folder, appends every entry's 'article_url'
    to the global ``urls`` list, and deletes each file afterwards so the
    next run starts from a clean state.
    """
    folder = 'C:\\Users\\Administrator\\Desktop\\Version4\\'
    filenames = [folder + '下载.json', folder + '下载 (1).json']
    while True:
        if all(os.path.exists(name) for name in filenames):
            for filename in filenames:
                _collect_article_urls(filename)
                os.remove(filename)
            break
        else:
            print("不存在!")
            time.sleep(1)
def _collect_article_urls(filename):
    """Append every 'article_url' found in one result file to global ``urls``."""
    with open(filename, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # Iterate the actual payload instead of assuming exactly 20 entries:
    # the original range(1, 20) skipped the first result and raised
    # IndexError whenever a page held fewer than 20 items.  Entries
    # without an article link (ads, galleries) are skipped.
    for entry in data['data']:
        if 'article_url' in entry:
            urls.append(entry['article_url'])
# Output file the coloured paragraphs are rendered into.
GEN_HTML = "wang2.html"
def write_in_browser(RES):
    """Overwrite the generated HTML file with the current content.

    Args:
        RES: Complete HTML text to persist; the file is truncated on each
            call, so callers always pass the full accumulated content.
    """
    # Context manager guarantees the handle is closed even if write()
    # raises — the original open/close pair leaked it on error.
    with open(GEN_HTML, 'w', encoding='utf-8') as f:
        f.write(RES)
# Entry point for one crawl run (button command / <Return> handler).
def start(event=None):
    """Run one crawl: fetch search JSON pages, parse them, scrape articles.

    Opens two Toutiao search-result JSON pages in the default browser
    (the user saves them as 下载.json / 下载 (1).json), waits for those
    files via geturls(), then scrapes the articles and renders the
    matching paragraphs into the output HTML file.

    Args:
        event: Ignored; accepted with a default so the function can also
            serve as a Tk <Return> key-event callback.
    """
    # NOTE(review): the keyword %E6%9C%97%E9%80%B8 ("朗逸") is hard-coded;
    # the 车型 entry box is only cleared at the end, never read.
    # The query string previously contained "×tamp=" — "&timestamp"
    # mangled by HTML-entity conversion (&times;) — which corrupted it.
    webbrowser.open("https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset=0&format=json&keyword=%E6%9C%97%E9%80%B8&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis&timestamp=1564664126248")
    time.sleep(1)
    webbrowser.open("https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset=20&format=json&keyword=%E6%9C%97%E9%80%B8&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis&timestamp=1564664126248")
    time.sleep(1)
    geturls()
    print(urls)
    for index in urls:
        # Debug aid: flash each harvested URL through the output file.
        write_in_browser(index)
    print(get_p_text(urls))
    username.set("")
# --- GUI: a tiny fixed-size window with a car-model entry and a button ---
window = tk.Tk()
window.title("呆萌小爬虫")
# Centre the fixed 190x50 window on the screen.
sw = window.winfo_screenwidth()   # screen width
sh = window.winfo_screenheight()  # screen height
ww = 190                          # window width
wh = 50                           # window height
x = (sw - ww) / 2
y = (sh - wh) / 2
window.geometry("%dx%d+%d+%d" % (ww, wh, x, y))
window.resizable(0, 0)
# Car-model input field (read via the StringVar).
username = tk.StringVar()
username.set("")
tk.Label(window, text='车型').grid(row=0, column=0)
username_entry = tk.Entry(window, textvariable=username)
username_entry.grid(row=0, column=1)
# Start button; pressing Enter triggers the same crawl.  Tk passes an
# event object to <Return> handlers, so wrap start() in a lambda that
# discards it — binding start directly raised TypeError on keypress.
bt_ok = tk.Button(window, text='爬起', command=start)
window.bind('<Return>', lambda event: start())
bt_ok.grid(row=6, columnspan=2)
# Enter the Tk event loop; blocks until the window is closed.
window.mainloop()