import sys

import requests
from pyquery import PyQuery as pq


def getHtml(url, timeout=10):
    """Download *url* and return the page body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The decoded response text, or ``None`` if the request failed
        (the failure is reported on stderr).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException as e:
        # Report the failure instead of returning the exception object,
        # which callers would otherwise mistake for page content.
        print('Request to {} failed: {}'.format(url, e), file=sys.stderr)
        return None
    # Decode with the encoding guessed from the body rather than the
    # (possibly missing or wrong) charset declared in the headers.
    r.encoding = r.apparent_encoding
    return r.text


def parseHtml(html, output_path='explore2.txt'):
    """Extract question/author/answer records from *html* and append them to a file.

    Each record is written as three lines (question, author, answer)
    followed by a line of 50 ``=`` characters.

    Args:
        html: Page markup as returned by ``getHtml``. ``None``, non-string
            or blank input is ignored.
        output_path: File the records are appended to (UTF-8).
    """
    if not isinstance(html, str) or not html.strip():
        return

    doc = pq(html)
    records = []
    # NOTE(review): this selector matches elements carrying BOTH classes;
    # confirm a descendant selector ('.explore-tab .feed-item') was not intended.
    for item in doc('.explore-tab.feed-item').items():
        question = item.find('h2').text()
        author = item.find('.author-link-line').text()
        # The content node's inner markup is parsed a second time to get
        # plain text; html() yields None when the node is missing, and
        # PyQuery cannot parse None or blank input.
        content_html = item.find('.content').html()
        if content_html and content_html.strip():
            answer = pq(content_html).text()
        else:
            answer = ''
        records.append(
            '\n'.join([question, author, answer]) + '\n' + '=' * 50 + '\n'
        )

    if not records:
        return
    # Open the output once for the whole batch instead of once per item.
    with open(output_path, 'a', encoding='utf-8') as f:
        f.writelines(records)


def main():
    """Fetch the Zhihu explore page and save the records parsed from it."""
    url = "https://www.zhihu.com/explore"
    html = getHtml(url)
    if html is None:
        # The download failed and was already reported; nothing to parse.
        return
    parseHtml(html)


if __name__ == '__main__':
    main()
知乎 爬虫
最新推荐文章于 2024-04-07 09:36:35 发布