Scraping Zhihu Explore with Python and Storing the Results [txt, json, csv, MongoDB]

#### Using XPath (save as JSON)

import requests
import json
from lxml import etree
from urllib import parse
url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response is a string; parse it into an HTML DOM
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')
for node in node_list:
    # xpath() returns a list; there is only one match, so take it by index
    # Question title
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n","")
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text
    #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
    #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]

    items = {
        "question" : question,
        "author" : author,
        "answer" : answer,
    } 

    with open("explore.json", "a") as f:
        #f.write(json.dumps(items, ensure_ascii = False).encode("utf-8") + "\n")
        f.write(json.dumps(items, ensure_ascii = False) + "\n")

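Each line of explore.json is a self-contained JSON object (JSON Lines), so reading the records back is a line-by-line loop. A minimal sketch, reusing the filename from the writer above:

import json

with open("explore.json", "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        print(item["question"], "-", item["author"])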
#### Save as TXT

import requests
from lxml import etree
from urllib import parse
url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response is a string; parse it into an HTML DOM
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

for node in node_list:
    # xpath() returns a list; there is only one match, so take it by index
    # Question title
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n","")
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text
    #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
    #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]

    with open('explore.txt', 'a', encoding='utf-8') as file:
        file.write('\n'.join([question, author, answer]))
        file.write('\n' + '=' * 50 + '\n')

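To read the TXT file back, one option is to split on the 50-character '=' separator written above. A minimal sketch; it assumes the scraped text itself never contains that separator:

with open('explore.txt', 'r', encoding='utf-8') as file:
    for block in file.read().split('=' * 50):
        # Drop blank lines left over from the separators
        lines = [line for line in block.splitlines() if line.strip()]
        if len(lines) >= 3:
            question, author = lines[0], lines[1]
            answer = '\n'.join(lines[2:])
            print(question, '|', author)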

#### Save as CSV
import requests
from lxml import etree
from urllib import parse
import csv

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response is a string; parse it into an HTML DOM
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

fieldnames = ['question', 'author', 'answer']
# Open the file once and write the header a single time (the original
# wrote a header before every row); newline='' stops the csv module
# from emitting blank lines on Windows
with open('explore.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for node in node_list:
        # xpath() returns a list; there is only one match, so take it by index
        # Question title
        question = node.xpath('.//h2/a')[0].text.replace("\n", "")
        # Author
        author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
        #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n","")
        # Answer; for a tidier display, keep only the first 10 characters
        answer = node.xpath('.//*[@class="content"]')[0].text[:10]
        #answer = node.xpath('.//*[@class="content"]')[0].text
        #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
        #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]

        writer.writerow({'question': question, 'author': author, 'answer': answer})

#### Read the CSV back

import csv

with open('explore.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        print(row)
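Because the file starts with a header row, csv.DictReader can return each row as a dict keyed by the column names instead of a bare list. A minimal sketch:

import csv

with open('explore.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        print(row['question'], '-', row['author'])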

#### Save to MongoDB

import requests
from lxml import etree
from urllib import parse
from pymongo import MongoClient

# Connect to a local MongoDB server (defaults to localhost:27017)
client = MongoClient()
db = client['explore']
collection = db['explore']

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response is a string; parse it into an HTML DOM
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

for node in node_list:
    # xpath() returns a list; there is only one match, so take it by index
    # Question title
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n","")
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text
    #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
    #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]

    items = {
        "question" : question,
        "author" : author,
        "answer" : answer,
    } 

    # insert() was removed from modern PyMongo; insert_one() is the
    # current API and returns a result carrying the new document's id
    result = collection.insert_one(items)
    if result.inserted_id:
        print('Saved to Mongo')

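To verify the inserts, query the collection back with find(). A minimal sketch reusing the database and collection names from above:

from pymongo import MongoClient

client = MongoClient()
collection = client['explore']['explore']

# An empty find() returns every document in the collection
for item in collection.find():
    print(item['question'], '-', item['author'])

Note that if the script runs more than once, the same answers are inserted again; an update_one(..., upsert=True) keyed on the question would deduplicate instead.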
