实战 Python 多进程爬取知乎合集页面并保存
目标:
爬取知乎合集中赞同数超过 10k 的所有回答。
1.存入mongoDB数据库
2.以markdown存入文件夹
代码:
#-*- coding:utf-8 -*-
import requests
from lxml import etree
import html2text
from multiprocessing import Pool
import time
import os
import pymongo
import re
# MongoDB connection used to persist scraped answers.
# Assumes a mongod instance is running locally on the default port — TODO confirm.
client = pymongo.MongoClient(host='localhost', port=27017)
# Database "pachong" (Chinese for "crawler"), collection "zhihu10k".
db = client.pachong
collection = db.zhihu10k
# Request headers sent with every fetch: a desktop Chrome user-agent plus a
# referer pointing at the target Zhihu collection, so requests resemble a
# normal browser visit rather than a bare script.
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
'referer': 'https://www.zhihu.com/collection/38887091'
}
def get_one_page(url,page):
response=requests.get(url,headers=headers)
html=etree.HTML(response.content)
titles=html.xpath('//h2[@class="zm-item-title"]/a/text()')
authors=html.xpath('//div[@class="zm-item-rich-text expandable js-collapse-body"]/@data-author-name')
votecounts=