import requests
import re
import json
import time
from lxml import etree
import dill
# Global variables
# Mapping that starts out empty at import time; it is filled in elsewhere
# (not shown in this part of the file).
userdict = dict()
# Root URL of the Baidu Tieba site.
tieba_prefix = "http://tieba.baidu.com"
# 参数信息类
class para:
    """Holder for crawler-wide settings, stored as plain class attributes.

    Every attribute starts as ``None`` and is expected to be assigned on the
    class itself (e.g. ``para.headers = {...}``) before the crawler runs.
    """

    # Passed as ``headers=`` to ``requests.get`` in ``getHtmlFromUrl``.
    headers = None
    # Passed as ``cookies=`` to ``requests.get`` in ``getHtmlFromUrl``.
    cookies = None
    # NOTE(review): the three limits below are not used in the visible part
    # of the file; presumably they cap crawl loops, pages per listing and
    # the number of users collected -- confirm against the callers.
    max_loop = None
    max_page = None
    max_num = None
# 用户信息类
class userinfo(object):
    """Record describing one user, identified by the URL it was created with.

    Only ``url`` is set at construction time; every other field starts as
    ``None`` (or an empty list) and is meant to be filled in later.
    """

    def __init__(self, url):
        """Create an empty record for the user page at *url*.

        Args:
            url: Address of the user's page; stored unchanged in ``self.url``.
        """
        self.url = url
        self.id = None
        self.username = None
        self.age = None
        self.tie = None
        self.sex = None
        self.concern_num = None
        self.concern_url = None
        # A fresh list per instance, so records never share follow lists.
        self.concern_list = []
        self.fans_num = None
        self.fans_url = None
        self.fans_list = []

    # Save to file
    def saveToFile(self):
        """Assemble the scalar fields of this record into a dict and return it.

        The original built this dict and then discarded it, so the method
        had no observable effect. Returning it lets callers serialize the
        record, and callers that ignore the result are unaffected.

        ``concern_list`` and ``fans_list`` are deliberately left out, exactly
        as in the original dict literal.

        NOTE(review): despite the method name, nothing in the visible code
        writes to disk; persisting the returned dict is up to the caller.

        Returns:
            dict: The record's ``url``, ``id``, ``username``, ``age``,
            ``tie``, ``sex``, ``concern_num``, ``concern_url``, ``fans_num``
            and ``fans_url`` values keyed by those names.
        """
        dictObj = {
            "url": self.url,
            "id": self.id,
            "username": self.username,
            "age": self.age,
            "tie": self.tie,
            "sex": self.sex,
            "concern_num": self.concern_num,
            "concern_url": self.concern_url,
            "fans_num": self.fans_num,
            "fans_url": self.fans_url,
        }
        return dictObj
# url解析
def getHtmlFromUrl(url, loop_info):
response = requests.get(url, headers=para.headers, cookies=para.cookies)
print('当前页面:' + url)
print(loop_info)
if response.status_code == 200:
# 很抱歉,您要访问的页面不存在。
if response.url == 'http://static.tieba.baidu.com/tb/error.html?ErrType=1':
data = response.content.decode('gbk') # gbk编码
html = etree.HTML(data)