有手就行的python爬虫实战

最新推荐文章于 2024-06-19 17:27:45 发布

hangshao0.0

最新推荐文章于 2024-06-19 17:27:45 发布

阅读量1.1k

点赞数 2

分类专栏： python

本文链接：https://blog.csdn.net/weixin_45254208/article/details/112642858

版权

python 专栏收录该内容

36 篇文章 1 订阅

订阅专栏

该代码示例展示了如何使用Python的requests和PyQuery库爬取知乎探索页面的内容，包括专题分类、文章标签和文章标题，并分别存储为txt、json、csv文件以及存入MySQL数据库。爬取内容后，对数据进行格式化输出，提高可读性。存储部分涵盖了文本、JSON、CSV和数据库四种常见方式，确保数据的有效保存。

摘要由CSDN通过智能技术生成

爬取内容

import requests
from pyquery import PyQuery as pq

# Fetch the Zhihu "explore" page and print the special-topic categories,
# article tags and article titles as three flat lists.
url = 'https://www.zhihu.com/explore'       # target URL
headers = {
    # A long string literal can be split across lines by wrapping it in
    # parentheses (as here) or by ending the first line with a backslash.
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36')
}

# Without a timeout, requests may wait forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page into
# three empty lists.
response.raise_for_status()
html = response.text
doc = pq(html)

# Select the elements by CSS class and keep only their text.
category = [node.text() for node in doc('.ExploreSpecialCard-info').items()]            # category names
tag = [node.text() for node in doc('.ExploreSpecialCard-contentTag').items()]           # article tags
title = [node.text() for node in doc('.ExploreSpecialCard-contentTitle').items()]       # article titles

# Print the results
print(category)
print(tag)
print(title)

显然，内容获取成功，但是排版不太美观，那就稍做处理：把 item1 的选择器由 .ExploreSpecialCard-info 改为 .ExploreSpecialCard-title，然后美化输出结果。

import requests
from pyquery import PyQuery as pq

# Fetch the Zhihu "explore" page and print each special-topic category
# followed by its (tag, title) pairs.
ITEMS_PER_CATEGORY = 3      # the flat tag/title lists are grouped three per category

url = 'https://www.zhihu.com/explore'       # target URL
headers = {
    # A long string literal can be split across lines by wrapping it in
    # parentheses (as here) or by ending the first line with a backslash.
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36')
}

# Without a timeout, requests may wait forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page.
response.raise_for_status()
html = response.text
doc = pq(html)

# Select the elements by CSS class and keep only their text.
category = [node.text() for node in doc('.ExploreSpecialCard-title').items()]           # category names
tag = [node.text() for node in doc('.ExploreSpecialCard-contentTag').items()]           # article tags
title = [node.text() for node in doc('.ExploreSpecialCard-contentTitle').items()]       # article titles

# Print the results
for index, name in enumerate(category):
    print('专题分类：', name)
    start = index * ITEMS_PER_CATEGORY
    stop = start + ITEMS_PER_CATEGORY
    # Slices never raise IndexError, so a page that yields fewer tags or
    # titles than expected prints what is available instead of crashing.
    for tag_text, title_text in zip(tag[start:stop], title[start:stop]):
        print('标签：{} ---> 标题：{}'.format(tag_text, title_text))
    print('=' * 30)

存储爬取的内容

txt文本存储

import requests
from pyquery import PyQuery as pq

# Fetch the Zhihu "explore" page and save each special-topic category with
# its (tag, title) pairs to explore.txt.
ITEMS_PER_CATEGORY = 3      # the flat tag/title lists are grouped three per category

url = 'https://www.zhihu.com/explore'       # target URL
headers = {
    # A long string literal can be split across lines by wrapping it in
    # parentheses (as here) or by ending the first line with a backslash.
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36')
}

# Without a timeout, requests may wait forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of writing an empty file.
response.raise_for_status()
html = response.text
doc = pq(html)

# Select the elements by CSS class and keep only their text.
category = [node.text() for node in doc('.ExploreSpecialCard-title').items()]           # category names
tag = [node.text() for node in doc('.ExploreSpecialCard-contentTag').items()]           # article tags
title = [node.text() for node in doc('.ExploreSpecialCard-contentTitle').items()]       # article titles

# File modes -- r: read only, w: overwrite, a: append, b: binary.
# The with-statement closes the file even if a write fails part-way.
with open('explore.txt', 'w', encoding='utf-8') as file:
    # Write the results
    for index, name in enumerate(category):
        file.write(name + '\n')
        start = index * ITEMS_PER_CATEGORY
        stop = start + ITEMS_PER_CATEGORY
        # Slices never raise IndexError when the page yields fewer items
        # than expected.
        for tag_text, title_text in zip(tag[start:stop], title[start:stop]):
            file.write(tag_text + '  --->  ' + title_text + '\n')
        file.write('\n' + '=' * 50 + '\n')

json文件存储

JSON 文本中的字符串必须用双引号包围，不能用单引号（Python 源码里的字典仍然可以用单引号，json.dumps() 输出时会自动转换为双引号）。

import json
# Sample payload: a list holding one record whose values include Chinese text.
record = {'name': '航少', 'gender': '男', 'age': 21}
data = [record]

# Serialise first, then write the finished text to disk in one call.
serialized = json.dumps(data, indent=2, ensure_ascii=False)
with open('data.json', mode='w', encoding='utf-8') as file:
    file.write(serialized)

# json.loads() is the counterpart of json.dumps().
# indent=2 indents the JSON output by two characters per level.
# encoding='utf-8' together with ensure_ascii=False is used to write the
# Chinese characters as-is.

csv文件存储

import requests
import csv
import codecs
from pyquery import PyQuery as pq

# Fetch the Zhihu "explore" page and save one CSV row per
# (category, tag, title) triple to explore.csv.
ITEMS_PER_CATEGORY = 3      # the flat tag/title lists are grouped three per category

url = 'https://www.zhihu.com/explore'       # target URL
headers = {
    # A long string literal can be split across lines by wrapping it in
    # parentheses (as here) or by ending the first line with a backslash.
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36')
}

# Without a timeout, requests may wait forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of writing a header-only file.
response.raise_for_status()
html = response.text
doc = pq(html)

# Select the elements by CSS class and keep only their text.
category = [node.text() for node in doc('.ExploreSpecialCard-title').items()]           # category names
tag = [node.text() for node in doc('.ExploreSpecialCard-contentTag').items()]           # article tags
title = [node.text() for node in doc('.ExploreSpecialCard-contentTitle').items()]       # article titles

# File modes -- r: read only, w: overwrite, a: append, b: binary.
#
# The file is opened exactly once. The 'utf-8-sig' codec writes the UTF-8
# byte-order mark itself, so no separate binary handle is needed for the BOM.
# Opening the same path a second time in 'w' mode truncates it, and omitting
# `encoding` makes the output depend on the platform default.
# newline='' lets the csv module control line endings.
with open('explore.csv', 'w', newline='', encoding='utf-8-sig') as file:
    csv_file = csv.writer(file)

    # Write the header row followed by the data rows
    csv_file.writerow(['专题分类', '标签', '标题'])
    for index, name in enumerate(category):
        start = index * ITEMS_PER_CATEGORY
        stop = start + ITEMS_PER_CATEGORY
        # Slices never raise IndexError when the page yields fewer items
        # than expected.
        for tag_text, title_text in zip(tag[start:stop], title[start:stop]):
            csv_file.writerow([name, tag_text, title_text])

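如果在文本模式下直接写入 codecs.BOM_UTF8（bytes 类型），就会报 TypeError。其实不必分两次打开文件：用 encoding='utf-8-sig' 打开一次即可，Python 会自动在文件开头写入 BOM，Excel 打开时中文也不会乱码。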

数据库存储

import requests
import pymysql
from pyquery import PyQuery as pq


# Fetch the Zhihu "explore" page and store one row per
# (category, tag, title) triple in the MySQL table `explore`.
ITEMS_PER_CATEGORY = 3      # the flat tag/title lists are grouped three per category

url = 'https://www.zhihu.com/explore'       # target URL
headers = {
    # A long string literal can be split across lines by wrapping it in
    # parentheses (as here) or by ending the first line with a backslash.
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36')
}

# Without a timeout, requests may wait forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx answer before touching the database.
response.raise_for_status()
html = response.text
doc = pq(html)

# Select the elements by CSS class and keep only their text.
category = [node.text() for node in doc('.ExploreSpecialCard-title').items()]           # category names
tag = [node.text() for node in doc('.ExploreSpecialCard-contentTag').items()]           # article tags
title = [node.text() for node in doc('.ExploreSpecialCard-contentTitle').items()]       # article titles

# Build the rows up front. Slices never raise IndexError when the page
# yields fewer tags or titles than expected.
rows = []
for index, name in enumerate(category):
    start = index * ITEMS_PER_CATEGORY
    stop = start + ITEMS_PER_CATEGORY
    for tag_text, title_text in zip(tag[start:stop], title[start:stop]):
        rows.append((name, tag_text, title_text))

# Connect to the database
db = pymysql.connect(host='localhost', user='root', password='root', port=3306, db='test', charset='utf8')

# "if not exists" lets the script be re-run: a plain "create table" raises
# once the table exists, which would skip every insert.
# NOTE: the column name 'catagory' (sic) is kept unchanged so that tables
# created by earlier runs remain compatible.
sql1 = ('create table if not exists explore (catagory varchar(256) not null, '
        'tag varchar(256) not null, title varchar(256) not null)')
sql2 = 'insert into explore (catagory, tag, title) values(%s, %s, %s)'

# Insert all rows in a single transaction: commit on success, roll back on
# any error.
# NOTE(review): MySQL commits DDL implicitly, so a rollback undoes the
# inserts but not the table creation.
try:
    with db.cursor() as cursor:
        cursor.execute(sql1)            # create the table if needed
        cursor.executemany(sql2, rows)  # parameterised bulk insert
    db.commit()
except Exception as e:
    print('报错信息：', e)
    db.rollback()
finally:
    # Always release the connection, even if commit or rollback fails.
    db.close()

其他数据库存储

其他的数据库的操作和MySQL差不多，无非具体函数有所差别。

hangshao0.0

关注

2
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
有手就行的python爬虫实战

目录爬取内容存储爬取的内容txt文本存储json文件存储csv文件存储数据库存储其他数据库存储爬取内容import requestsfrom pyquery import PyQuery as pqcategory = [] # 存储分类名称tag = [] # 存储文章标签title = [] # 存储文章标题url = 'https://www.zhihu.com/explore' # 目标URLheaders = { 'User-A
复制链接

扫一扫

专栏目录