# zhihu_pbm.py -- Zhihu (知乎) answer scraper

#!/usr/bin/env python

# coding:utf-8

import re

import json

import os

import sys

import time

import urllib.parse

import requests

from requests.exceptions import RequestException

from bs4 import BeautifulSoup, element

# Patterns whose matches parse_contents() deletes from the prettified HTML:
# REGEX_HTMLTAG_RM is applied first, then REGEX_HTMLTAG.
# NOTE(review): both patterns start with "(]*>", which looks like the leading
# "<...[^>" part was lost when this file was extracted from a web page --
# verify against the original source before relying on them.
REGEX_HTMLTAG_RM = re.compile("(]*>.*?(a|script)>)", re.S | re.I)

REGEX_HTMLTAG = re.compile("(]*>|[ ]+|<div>)", re.S)

def parse_contents(content):
    """Turn a parsed HTML node into a list of non-blank text lines.

    Args:
        content: An object exposing ``prettify()`` (the caller passes the
            result of a BeautifulSoup ``find``), or a falsy value when
            nothing was found.

    Returns:
        A list of stripped lines. The prettified markup has every
        ``REGEX_HTMLTAG_RM`` match and then every ``REGEX_HTMLTAG`` match
        removed, is split on newlines, and lines that are empty or
        whitespace-only are dropped. Returns ``[]`` for falsy input.
    """
    if not content:
        return []

    markup = REGEX_HTMLTAG_RM.sub("", content.prettify())
    markup = REGEX_HTMLTAG.sub("", markup)

    # Strip once per line; a line that strips to "" was empty or all
    # whitespace and is skipped.
    stripped_lines = (line.strip() for line in markup.split("\n"))
    return [line for line in stripped_lines if line]

def save(filename, contents):
    """Write each string in *contents* to *filename*, one per line.

    The file is opened as UTF-8 text and truncated if it already exists.

    Args:
        filename: Path of the file to write.
        contents: Iterable of strings; a newline is appended to each.
    """
    # Plain "w" is enough: the file is only written, never read back.
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(content + "\n" for content in contents)

def mkdir_p(path):
    """Create directory *path* and any missing parent directories.

    Existing directories are left untouched, so the call can be repeated.
    An empty *path* is a no-op.

    Args:
        path: Directory path to create. Absolute paths are honoured; the
            previous hand-rolled split on "/" dropped the leading slash and
            created a relative tree instead.
    """
    if not path:
        return
    # exist_ok avoids the check-then-create race of a manual exists()/mkdir().
    os.makedirs(path, exist_ok=True)

# --- Script body: download one Zhihu question page and save each answer ---

url = "https://www.zhihu.com/question/290543744"

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0"
}

# TLS certificate verification is left at requests' default (enabled) and a
# timeout is set so a stalled connection cannot hang the script forever.
try:
    response = requests.get(url, headers=headers, timeout=30)
except RequestException as err:
    print("download page error: %s" % err)
    sys.exit(1)

if response.status_code != 200:
    # Use %-formatting so the status code is actually substituted.
    print("download page error: status[%d]" % response.status_code)
    sys.exit(1)

text = response.text

html = BeautifulSoup(text, 'html.parser')

# html.body is None when the document has no <body>; treat that as "no answers".
items = html.body.find_all(attrs={"class": "List-item"}) if html.body else []

# One output directory per question, named after the last URL path segment.
path_name = "outputs/zhihu/" + url.split('/')[-1] + '/'

for item in items:
    voters_box = item.find(attrs={"class": "Voters"})
    answer_box = item.find(attrs={"class": "AnswerItem"})

    # Skip list entries that lack the vote button or the answer container
    # instead of crashing on None.
    if voters_box is None or voters_box.button is None or answer_box is None:
        continue

    answer = answer_box.get("name")
    if answer is None:
        continue

    # First space-separated token of the button text, with commas removed.
    voters = voters_box.button.text.split(' ')[0].replace(',', '')

    content = item.find(attrs={"class": "CopyrightRichText-richText"})
    contents = parse_contents(content)

    mkdir_p(path_name)

    # Output file name is "<voters>-<answer name>.txt".
    filename = path_name + voters + "-" + answer + '.txt'
    save(filename, contents)

def test_pp():
    """Fetch one page of answers for question 290543744 from the v4 API.

    Sends a GET request with browser-like headers to the
    ``/api/v4/questions/.../answers`` endpoint (``limit=5``, ``offset=25``).

    Returns:
        The list stored under the ``"data"`` key of the JSON response, or
        ``[]`` when that key is absent.
        NOTE(review): the endpoint path and the ``include=data[*]...`` query
        suggest a JSON body with a ``data`` list -- confirm against a live
        response.

    Exits the process with status 1 when the request fails, the status code
    is not 200, or the body is not valid JSON.
    """
    url = (
        'https://www.zhihu.com/api/v4/questions/290543744/answers'
        '?include=data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,'
        'annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,'
        'suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,'
        'reshipment_settings,comment_permission,created_time,updated_time,review_info,'
        'relevant_info,question,excerpt,relationship.is_authorized,is_author,voting,'
        'is_thanked,is_nothelp,is_labeled,is_recognized,paid_info;'
        'data[*].mark_infos[*].url;data[*].author.follower_count,badge[*].topics'
        '&limit=5&offset=25&platform=desktop&sort_by=default'
    )

    # NOTE(review): in the pasted source this literal was broken by a line
    # wrap right after "tp_sft_v2=" (leaving "tp_sft_v2= " / "a;..."). It is
    # rejoined here as "tp_sft_v2=a" -- confirm against the original header.
    ab_param = (
        "pf_foltopic_usernum=50;se_likebutton=0;top_ntr=1;zr_infinity_topic=close;"
        "top_ydyq=X;li_es_new=new;li_se_intervene=1;top_reason=1;top_user_cluster=0;"
        "ug_goodcomment=0;zr_km_answer=base;pf_feed=1;se_click_del=1;se_p_slideshow=0;"
        "top_nucc=0;li_se_ebook_chapter=1;se_ad_index=10;se_page_limit_20=1;"
        "top_quality=0;ug_follow_topic_1=2;qa_answerlist_ad=0;se_backsearch=0;"
        "top_new_user_rec=0;tp_discussion_feed_type_android=2;ug_newtag=0;gw_guide=0;"
        "se_ltr_0419=0;soc_special=0;top_wonderful=1;top_native_answer=1;top_v_album=1;"
        "zr_ads_search=0;li_se_new_card=1;pf_noti_entry_num=0;top_hotcommerce=1;"
        "tsp_lastread=0;top_new_feed=5;se_rewrite=0;qa_video_answer_list=0;top_rank=0;"
        "top_vipconsume=1;ls_fmp4=0;top_billupdate1=2;se_se_index=1;se_whitelist=0;"
        "ug_goodcomment_0=0;ug_zero_follow=0;se_km_ad_locate=1;se_zu_onebox=0;top_root=0;"
        "ug_zero_follow_0=0;se_webtimebox=0;soc_bignew=1;top_recall_deep_user=1;tp_sft=a;"
        "zr_answer_rec=close;zr_rel_search=base;se_billboardsearch=0;se_lottery=0;"
        "se_wannasearch=0;top_zh_tailuser=1;se_click_wiki=0;top_recall_exp_v1=1;"
        "zr_ans_rec=gbrank;zr_art_rec=base;soc_update=1;top_recall_exp_v2=1;"
        "zr_search_material=0;zr_video_rec_weight=close;li_qa_cover=old;se_expired_ob=0;"
        "se_featured=0;se_search_feed=N;se_title_only=0;se_webmajorob=0;top_ebook=0;"
        "top_universalebook=1;se_zu_recommend=0;tp_m_intro_re_topic=1;se_rr=0;"
        "zr_article_rec_rank=truncate;qa_test=0;se_ios_spb309=0;tp_qa_metacard=1;"
        "se_subtext=0;top_source=0;se_amovietab=0;se_spb309=0;top_vipoffice=1;"
        "tp_sft_v2=a;"
        "ug_follow_answerer=0;ug_fw_answ_aut_1=0;li_filter_ttl=2;li_lt_tp_score=1;"
        "se_payconsult=0;top_bill=0;pf_fuceng=1;se_colorfultab=0;se_terminate=0;"
        "se_site_onebox=0;zr_feed_cf=1;zr_km_material_buy=promotion;se_preset_tech=0;"
        "top_test_4_liguangyi=1;zr_video_rec=zr_video_rec:base;li_tjys_ec_ab=0;"
        "qa_web_answerlist_ad=0;se_auto_syn=0;se_movietab=0;tp_sticky_android=0;"
        "li_ts_sample=old;soc_bigone=0;tp_header_style=1;tp_qa_metacard_top=top;"
        "pf_creator_card=1;ug_follow_answerer_0=0;pf_newguide_vertical=0;"
        "li_album_liutongab=0;se_webrs=1;se_websearch=3;top_gr_ab=0"
    )

    headers = {
        "Host": "www.zhihu.com",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.5",
        # "br" is no longer advertised: requests can only decode Brotli when
        # an optional brotli package is installed, otherwise response.text is
        # undecoded binary. NOTE(review): presumably the cause of the garbled
        # response.content noted in the original -- confirm.
        "Accept-Encoding": "gzip, deflate",
        "Referer": "https://www.zhihu.com/question/290543744",
        "x-requested-with": "fetch",
        "X-Ab-Param": ab_param,
        "Connection": "keep-alive",
        # NOTE(review): hard-coded session cookie; consider loading it from
        # configuration or the environment instead of committing it.
        "Cookie": "_zap=e38dd652-6529-4c83-af54-a531884cffd4; _xsrf=2b497210-dceb-411f-aa7b-72ccb0b97150; d_c0=\"APAm6ZSGUw-PTtqr4VIKYh1yfYnb8PNgq2A=|1556083353\"; q_c1=8f3942059d7b42ba901f6dacd5d0f6ff|1556083354000|1556083354000; __gads=ID=09c485c5c2093ce7:T=1556087083:S=ALNI_MaSrBJOQyIcmbdmofil5ma_SFMFDA; tgw_l7_route=a37704a413efa26cf3f23813004f1a3b",
        "TE": "Trailers",
    }

    # TLS verification stays at requests' default (enabled); the timeout
    # keeps a stalled connection from hanging the call.
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except RequestException as err:
        print("download page error: %s" % err)
        sys.exit(1)

    if response.status_code != 200:
        # Use %-formatting so the status code is actually substituted.
        print("download page error: status[%d]" % response.status_code)
        sys.exit(1)

    # Parse the body as JSON rather than HTML; the previous
    # BeautifulSoup(...).body lookup dereferenced None for a non-HTML body.
    try:
        payload = response.json()
    except ValueError:
        print("download page error: response is not valid JSON")
        sys.exit(1)

    items = payload.get("data", []) if isinstance(payload, dict) else []
    return items

# (Web-page UI residue from the source blog -- copy/edit buttons and a payment
# dialog -- was here; it is not part of the script.)