Python: the right way to open Zhihu

Reference link: https://zhuanlan.zhihu.com/p/28680797

 

The main idea is to use ChromeDriver (driven by Selenium) to simulate scrolling and clicking on the page.

BeautifulSoup is then used to parse the rendered HTML.

Once the image URLs are extracted, the escaped markup is decoded and the images are downloaded.
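
Why the decoding step? The answer page embeds the real <img> markup HTML-escaped inside <noscript> tags, which is why the full script below runs the <noscript> text through HTMLParser.unescape before parsing it a second time. A minimal sketch of just that step (Python 3 here, using the stdlib html module; the sample string is illustrative, not an actual Zhihu response):

# -*- coding: utf-8 -*-
# Decode escaped <img> markup pulled out of a <noscript> tag (Python 3).
import html
from bs4 import BeautifulSoup

# Illustrative escaped fragment, of the kind get_text() returns for <noscript>.
escaped = '&lt;img src="https://pic1.zhimg.com/example.jpg" width="400"&gt;'
decoded = html.unescape(escaped)   # -> '<img src="https://..." width="400">'
img = BeautifulSoup(decoded, 'html.parser').find('img')
print(img.get('src'))              # -> https://pic1.zhimg.com/example.jpg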

 

Code:

# -*- coding:utf-8 -*-

import sys
import time
import urllib
from bs4 import BeautifulSoup
from HTMLParser import HTMLParser
from selenium import webdriver

# Python 2 workaround so UTF-8 text can be written to files without
# UnicodeEncodeError (this hack does not exist in Python 3).
reload(sys)
sys.setdefaultencoding("utf-8")

def main():

    # Open the question page with ChromeDriver.
    driver = webdriver.Chrome()
    driver.get("https://www.zhihu.com/question/54104076")

    # Scroll to the bottom and click the "load more" button to pull in
    # additional answers.
    def execute_times(times):

        for i in range(times + 1):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            try:
                driver.find_element_by_css_selector('button.QuestionMainAction').click()
                print "page" + str(i)
                time.sleep(1)
            except Exception:
                # No "load more" button left, so stop scrolling.
                break
    execute_times(3)  # load a few more pages of answers

    # Grab the rendered HTML and parse it with BeautifulSoup.
    result_raw = driver.page_source
    result_soup = BeautifulSoup(result_raw, 'html.parser')
    result_bf = result_soup.prettify()

    # Save the prettified HTML for inspection.
    with open("D:\\imagine\\raw_result.txt", 'w') as girls:
        girls.write(result_bf)
    print "Store raw data successfully!!!"

    # Find all <noscript> tags and record their contents; the real <img>
    # markup is HTML-escaped inside them, so unescape it.
    with open("D:\\imagine\\noscript_meta.txt", 'w') as noscript_meta:
        noscript_nodes = result_soup.find_all('noscript')
        noscript_inner_all = ""
        for noscript in noscript_nodes:
            noscript_inner = noscript.get_text()
            noscript_inner_all += noscript_inner + "\n"

        h = HTMLParser()
        noscript_all = h.unescape(noscript_inner_all)
        noscript_meta.write(noscript_all)
    print "Store noscript meta data successfully!!!"

    # Parse the unescaped markup, record each image URL, and download it.
    img_soup = BeautifulSoup(noscript_all, 'html.parser')
    img_nodes = img_soup.find_all('img')
    with open("D:\\imagine\\img_meta.txt", 'w') as img_meta:
        count = 0
        for img in img_nodes:
            if img.get('src') is not None:
                img_url = img.get('src')

                line = str(count) + "\t" + img_url + "\n"
                img_meta.write(line)
                urllib.urlretrieve(img_url, "D:\\imagine\\" + str(count) + ".jpg")  # download the image
                count += 1

    print "Store meta data and images successfully!!!"

if __name__ == '__main__':
    main()
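
The listing above is Python 2 only (print statements, the HTMLParser module, urllib.urlretrieve, and the reload(sys) encoding hack). Below is a minimal Python 3 sketch of the same flow, assuming Selenium 4+, whose find_element(By.CSS_SELECTOR, ...) replaces the old find_element_by_css_selector; the question URL, the button.QuestionMainAction selector, and the D:\imagine output directory are carried over from the original, and Zhihu's markup may well have changed since:

# -*- coding: utf-8 -*-
# Python 3 sketch of the same crawl, assuming Selenium 4+.
import html
import time
import urllib.request

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

def main():
    driver = webdriver.Chrome()
    driver.get("https://www.zhihu.com/question/54104076")

    # Scroll and click "load more" a few times, as in the original.
    for i in range(4):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        try:
            driver.find_element(By.CSS_SELECTOR, 'button.QuestionMainAction').click()
            print("page", i)
            time.sleep(1)
        except Exception:
            break  # no "load more" button left

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()

    # Unescape the <noscript> contents, then pull the <img> URLs out of them.
    noscript_all = html.unescape(
        "\n".join(tag.get_text() for tag in soup.find_all('noscript')))
    img_soup = BeautifulSoup(noscript_all, 'html.parser')

    count = 0
    for img in img_soup.find_all('img'):
        src = img.get('src')
        if src:
            urllib.request.urlretrieve(src, "D:\\imagine\\%d.jpg" % count)
            count += 1
    print("Downloaded %d images" % count)

if __name__ == '__main__':
    main()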

 

 

Result screenshot: (image omitted)

 

Reposted from: https://www.cnblogs.com/general10/p/7530388.html
