使用 Python 分析 json 文件,爬取今日头条的文章并进行处理。
如今 python 爬虫的方式可以使用 BeautifulSoup,当遇到有动态的 js 文件时,可以使用 Selenium 进行模拟操作,但是使用这些方法来处理今日头条的搜索链接时毫无作用。我选择的方法其实也是一个绕弯的方法:先将 json 文件本地化,再进行 json 文件中数据的分析。
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
#from selenium import webdriver
import webbrowser
import random
import requests #用来抓取网页的html源代码
import socket
import os
import json
from bs4 import BeautifulSoup #用来代替正则表达式取源码中相应标签的内容
import time
import http.client #用做异常处理
import tkinter as tk
import win32clipboard as w
# Shared list of article URLs harvested from the downloaded JSON search
# results; filled by geturls() and consumed by start()/get_p_text().
urls=[]
def get_html(url,encodeing):
    """Download a URL while presenting desktop-browser request headers.

    Retries with a randomized back-off on transient network errors so a
    temporary hiccup does not abort the whole crawl.

    Args:
        url: Address to download.
        encodeing: Character encoding used to decode the response body
            (parameter name kept as-is for backward compatibility; it is
            a typo of "encoding").

    Returns:
        The decoded response text, or "" when every retry attempt failed.
    """
    header={
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
    }
    # Randomized timeout so the request timing looks less machine-generated.
    timeout = random.choice(range(80, 180))
    # Bounded retries: the original looped forever and only caught raw
    # socket/http.client exceptions, but requests wraps those in its own
    # RequestException hierarchy, so the handlers never fired and an
    # unassigned rep ("") would have crashed on rep.text.
    for _attempt in range(5):
        try:
            rep = requests.get(url, headers=header, timeout=timeout)
            rep.encoding = encodeing
            return rep.text
        except socket.timeout as e:
            print("3:", e)
            time.sleep(random.choice(range(8, 15)))
        except socket.error as e:
            print("4:", e)
            time.sleep(random.choice(range(20, 60)))
        except http.client.BadStatusLine as e:
            print("5:", e)
            time.sleep(random.choice(range(30, 80)))
        except http.client.IncompleteRead as e:
            print("6:", e)
            time.sleep(random.choice(range(5, 15)))
        except requests.exceptions.RequestException as e:
            # Covers ConnectTimeout, ReadTimeout, ConnectionError, etc.
            print("request error:", e)
            time.sleep(random.choice(range(5, 15)))
    return ""
def get_p_text(urls):
    """Collect colour-highlighted <p> paragraphs from the given article pages.

    Each page is downloaded, its <p> tags are scanned, and any paragraph
    containing one of the interesting car-review keywords is wrapped in a
    coloured <p> element.  The growing HTML is flushed to disk through
    write_in_browser() after every addition so partial results are visible.

    Args:
        urls: Iterable of article URLs to scrape.

    Returns:
        The accumulated HTML string.
    """
    # Keyword -> CSS colour, tested in priority order.  The "not in RES"
    # guard below deduplicates: once a paragraph is added, no later
    # keyword (or page) can add the same text again — this reproduces the
    # original chain of independent if-statements exactly.
    keyword_colors = [
        ('新车尾部', 'red'),
        ('外观', 'black'),
        ('动力', 'green'),
        ('配置', 'red'),
        ('侧面', 'blue'),
    ]
    RES = ''
    for url in urls:
        print(url)
        body = get_html(url, 'utf-8')
        bs = BeautifulSoup(body, "html.parser")  # parse the page
        body = bs.body  # restrict the search to the document body
        car_body_p = body.find_all("p")
        print(car_body_p)
        for tag in car_body_p:
            # get_text() walks the subtree; call it once per paragraph
            # instead of up to ten times as the original did.
            text = tag.get_text()
            for keyword, color in keyword_colors:
                if keyword in text and text not in RES:
                    RES += '<p style="color:' + color + ';">' + text + '</p>' + '\n'
                    write_in_browser(RES)
    return RES
def geturls():
    """Wait for the two downloaded search-result JSON files, then harvest URLs.

    Polls once per second until both "下载.json" and "下载 (1).json" exist
    in the hard-coded desktop folder, appends every entry's 'article_url'
    to the global ``urls`` list, and deletes each file afterwards so the
    next run starts from a clean state.
    """
    folder = 'C:\\Users\\Administrator\\Desktop\\Version4\\'
    filenames = [folder + '下载.json', folder + '下载 (1).json']
    while True:
        if all(os.path.exists(name) for name in filenames):
            for filename in filenames:
                _collect_article_urls(filename)
                os.remove(filename)
            break
        else:
            print("不存在!")
            time.sleep(1)
def _collect_article_urls(filename):
    """Append every 'article_url' found in one result file to global ``urls``."""
    with open(filename, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # Iterate the actual payload instead of assuming exactly 20 entries:
    # the original range(1, 20) skipped the first result and raised
    # IndexError whenever a page held fewer than 20 items.  Entries
    # without an article link (ads, galleries) are skipped.
    for entry in data['data']:
        if 'article_url' in entry:
            urls.append(entry['article_url'])
# Output file the coloured paragraphs are rendered into.
GEN_HTML = "wang2.html"
def write_in_browser(RES):
    """Overwrite the generated HTML file with the current content.

    Args:
        RES: Complete HTML text to persist; the file is truncated on each
            call, so callers always pass the full accumulated content.
    """
    # Context manager guarantees the handle is closed even if write()
    # raises — the original open/close pair leaked it on error.
    with open(GEN_HTML, 'w', encoding='utf-8') as f:
        f.write(RES)
# Entry point for one crawl run (button command / <Return> handler).
def start(event=None):
    """Run one crawl: fetch search JSON pages, parse them, scrape articles.

    Opens two Toutiao search-result JSON pages in the default browser
    (the user saves them as 下载.json / 下载 (1).json), waits for those
    files via geturls(), then scrapes the articles and renders the
    matching paragraphs into the output HTML file.

    Args:
        event: Ignored; accepted with a default so the function can also
            serve as a Tk <Return> key-event callback.
    """
    # NOTE(review): the keyword %E6%9C%97%E9%80%B8 ("朗逸") is hard-coded;
    # the 车型 entry box is only cleared at the end, never read.
    # The query string previously contained "×tamp=" — "&timestamp"
    # mangled by HTML-entity conversion (&times;) — which corrupted it.
    webbrowser.open("https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset=0&format=json&keyword=%E6%9C%97%E9%80%B8&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis&timestamp=1564664126248")
    time.sleep(1)
    webbrowser.open("https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset=20&format=json&keyword=%E6%9C%97%E9%80%B8&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis&timestamp=1564664126248")
    time.sleep(1)
    geturls()
    print(urls)
    for index in urls:
        # Debug aid: flash each harvested URL through the output file.
        write_in_browser(index)
    print(get_p_text(urls))
    username.set("")
# --- GUI: a tiny fixed-size window with a car-model entry and a button ---
window = tk.Tk()
window.title("呆萌小爬虫")
# Centre the fixed 190x50 window on the screen.
sw = window.winfo_screenwidth()   # screen width
sh = window.winfo_screenheight()  # screen height
ww = 190                          # window width
wh = 50                           # window height
x = (sw - ww) / 2
y = (sh - wh) / 2
window.geometry("%dx%d+%d+%d" % (ww, wh, x, y))
window.resizable(0, 0)
# Car-model input field (read via the StringVar).
username = tk.StringVar()
username.set("")
tk.Label(window, text='车型').grid(row=0, column=0)
username_entry = tk.Entry(window, textvariable=username)
username_entry.grid(row=0, column=1)
# Start button; pressing Enter triggers the same crawl.  Tk passes an
# event object to <Return> handlers, so wrap start() in a lambda that
# discards it — binding start directly raised TypeError on keypress.
bt_ok = tk.Button(window, text='爬起', command=start)
window.bind('<Return>', lambda event: start())
bt_ok.grid(row=6, columnspan=2)
# Enter the Tk event loop; blocks until the window is closed.
window.mainloop()