Scraping Zhihu Explore with Python and Storing the Results [txt, json, csv, MongoDB]

#### Using XPath (save as JSON)

import requests
import json
from lxml import etree
from urllib import parse
url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response is a string; parse it into an HTML DOM
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')
for node in node_list:
    # xpath() returns a list; there is only one match, so take it by index
    # Question title
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n","")
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text
    #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
    #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]

    items = {
        "question" : question,
        "author" : author,
        "answer" : answer,
    } 

    with open("explore.json", "a") as f:
        #f.write(json.dumps(items, ensure_ascii = False).encode("utf-8") + "\n")
        f.write(json.dumps(items, ensure_ascii = False) + "\n")

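Each line of explore.json is a self-contained JSON object (JSON Lines), so reading the records back is a line-by-line loop. A minimal sketch, reusing the filename from the writer above:

import json

with open("explore.json", "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        print(item["question"], "-", item["author"])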
#### Save as TXT

import requests
from lxml import etree
from urllib import parse
url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response is a string; parse it into an HTML DOM
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

for node in node_list:
    # xpath() returns a list; there is only one match, so take it by index
    # Question title
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n","")
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text
    #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
    #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]

    with open('explore.txt', 'a', encoding='utf-8') as file:
        file.write('\n'.join([question, author, answer]))
        file.write('\n' + '=' * 50 + '\n')

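To read the TXT file back, one option is to split on the 50-character '=' separator written above. A minimal sketch; it assumes the scraped text itself never contains that separator:

with open('explore.txt', 'r', encoding='utf-8') as file:
    for block in file.read().split('=' * 50):
        # Drop blank lines left over from the separators
        lines = [line for line in block.splitlines() if line.strip()]
        if len(lines) >= 3:
            question, author = lines[0], lines[1]
            answer = '\n'.join(lines[2:])
            print(question, '|', author)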

#### Save as CSV
import requests
from lxml import etree
from urllib import parse
import csv

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response is a string; parse it into an HTML DOM
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

fieldnames = ['question', 'author', 'answer']
# Open the file once and write the header a single time (the original
# wrote a header before every row); newline='' stops the csv module
# from emitting blank lines on Windows
with open('explore.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for node in node_list:
        # xpath() returns a list; there is only one match, so take it by index
        # Question title
        question = node.xpath('.//h2/a')[0].text.replace("\n", "")
        # Author
        author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
        #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n","")
        # Answer; for a tidier display, keep only the first 10 characters
        answer = node.xpath('.//*[@class="content"]')[0].text[:10]
        #answer = node.xpath('.//*[@class="content"]')[0].text
        #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
        #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]

        writer.writerow({'question': question, 'author': author, 'answer': answer})

#### Read the CSV back

import csv

with open('explore.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        print(row)
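Because the file starts with a header row, csv.DictReader can return each row as a dict keyed by the column names instead of a bare list. A minimal sketch:

import csv

with open('explore.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        print(row['question'], '-', row['author'])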

#### Save to MongoDB

import requests
from lxml import etree
from urllib import parse
from pymongo import MongoClient

# Connect to a local MongoDB server (defaults to localhost:27017)
client = MongoClient()
db = client['explore']
collection = db['explore']

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response is a string; parse it into an HTML DOM
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

for node in node_list:
    # xpath() returns a list; there is only one match, so take it by index
    # Question title
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n","")
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text
    #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
    #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]

    items = {
        "question" : question,
        "author" : author,
        "answer" : answer,
    } 

    # insert() was removed from modern PyMongo; insert_one() is the
    # current API and returns a result carrying the new document's id
    result = collection.insert_one(items)
    if result.inserted_id:
        print('Saved to Mongo')

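To verify the inserts, query the collection back with find(). A minimal sketch reusing the database and collection names from above:

from pymongo import MongoClient

client = MongoClient()
collection = client['explore']['explore']

# An empty find() returns every document in the collection
for item in collection.find():
    print(item['question'], '-', item['author'])

Note that if the script runs more than once, the same answers are inserted again; an update_one(..., upsert=True) keyed on the question would deduplicate instead.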
