python爬贴吧数据存mysql完整代码案例

import json
import re
import urllib.parse
import urllib.request

import parsel  # pip install parsel
import pymysql
import requests
from pymysql.cursors import DictCursor

# Browser User-Agent header so tieba does not reject the requests as a bot.
header = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36'
}

# Module-level output file for savefile(); opened for the process lifetime and
# never closed explicitly — relies on interpreter exit to flush. TODO: wrap in
# a context manager at the call site.
file = open('电脑吧数据.txt', 'w', encoding='utf-8')

# 爬取贴吧数据

def spider(startpage, endpage, pagesize):
    """Crawl tieba listing pages and hand each page URL to page_data().

    :param startpage: first `pn` offset to fetch
    :param endpage:   last `pn` offset (inclusive — range uses endpage + 1)
    :param pagesize:  offset step between listing pages (tieba uses 50)
    """
    page_num = 0
    # range() is left-inclusive / right-exclusive, hence the + 1.
    for page in range(startpage, endpage + 1, pagesize):
        page_num += 1
        print('===================正在抓取贴吧的第{}页数据==================='.format(page_num))
        # kw=%E7%94%B5%E8%84%91 is the URL-encoded forum name "电脑".
        url = 'https://tieba.baidu.com/f?kw=%E7%94%B5%E8%84%91&ie=utf-8&pn={}'.format(page)
        page_data(url)

# 解析贴吧主页

def page_data(url):
    """Fetch one forum listing page and parse every thread found on it.

    :param url: listing-page URL produced by spider()
    """
    request = urllib.request.Request(url=url, headers=header)
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    # Thread links look like href="/p/1234567" — capture the numeric id only.
    thread_ids = re.findall(r'href="/p/(\d+)"', html)
    for thread_id in thread_ids:
        parser_thread(thread_id)

# 解析帖子内容

def parser_thread(thread_id):
    """Fetch the first page of one thread, persist the master row and every
    floor (reply) visible on that page.

    :param thread_id: numeric thread id extracted from the listing page
    """
    thread_url = 'http://tieba.baidu.com/p/' + str(thread_id)
    print('thread_url', thread_url)
    # First page of the thread; it also carries the total page count.
    response = requests.get(thread_url, headers=header).text
    response_data = parsel.Selector(response)
    # Thread title.
    thread_title = response_data.xpath('//h1/text()').extract()[0]
    # Each floor div carries a JSON blob in @data-field (author, date, post_id).
    content_field = \
        response_data.xpath('//div[contains(@class,"l_post j_l_post l_post_bright")]/@data-field').extract()
    content_field_json = json.loads(content_field[0])
    # First floor's metadata describes the thread itself.
    publish_date = content_field_json['content']['date']
    # NOTE: user_name may be incomplete when the nickname contains images/emoji.
    thread_author = content_field_json['author']['user_name']
    # Avatar src is protocol-relative, so prefix the scheme.
    avatar_url = 'https:' + response_data.xpath('//ul/li/div/a/img/@src').extract()[0]
    # span[0] = total replies, span[1] = total pages.
    thread_reply_count = response_data.xpath('//li[@class="l_reply_num"]/span/text()').extract()[0]
    thread_page_count = int(response_data.xpath('//li[@class="l_reply_num"]/span/text()').extract()[1])
    # Persist the thread master row.
    save_thread(thread_id, thread_title, thread_author, publish_date, avatar_url)
    # All floor-content divs on this page.
    thread_contents = response_data.xpath('.//div[contains(@id,"post_content_")]')
    index = 0
    # NOTE(review): original loop header was truncated ("while index") by the
    # paste — bounded here by the number of floors, which matches the body's
    # per-index accesses. Only page 1 of the thread is crawled.
    while index < len(thread_contents):
        # Floor text; drop the 12 leading whitespace chars tieba pads with.
        content_text = thread_contents.xpath('string(.)').extract()[index]
        content_text = content_text[12:]
        field_json = json.loads(content_field[index])
        detail_publish_date = field_json['content']['date']
        thread_detail_id = field_json['content']['post_id']
        # Selector scoped to this floor only.
        content_sel = thread_contents[index]
        images = content_sel.xpath('img/@src').extract()
        index = index + 1
        print('第{}楼'.format(index))
        save_thread_detail(thread_detail_id, thread_id, content_text, str(images), detail_publish_date)

# 保存贴子主数据

def save_thread(thread_id, thread_title, nickname, publish_time, avatar_url):
    """Insert one thread master row into the thread_info table.

    Opens a fresh connection per call — simple, but slow for bulk inserts.
    """
    # Parameterized statement: values are bound by the driver, never formatted
    # into the string (safe against SQL injection from scraped content).
    # A space separates the two literals — the original concatenation produced
    # "...avatar_url)value (...".
    sql = ('insert into thread_info(thread_id, thread_title, nickname, publish_time, avatar_url) '
           'value (%s, %s, %s, %s, %s )')
    conn = None
    cursor = None
    try:
        # SECURITY/TODO: real credentials are hard-coded (and published) —
        # move them to environment variables / a config file and rotate them.
        conn = pymysql.connect(
            host='47.101.213.133',   # server address
            port=3306,               # MySQL port
            user='dreaming',         # user name
            password='30wish2003!',  # password
            charset='utf8',          # MySQL spells it 'utf8', not 'utf-8'
            database='x_player',     # schema name
            cursorclass=DictCursor)
        cursor = conn.cursor()
        r = cursor.execute(sql, (thread_id, thread_title, nickname, publish_time, avatar_url))
        conn.commit()
        print('save success -', r)
    except Exception:
        # Roll back the failed transaction instead of leaving it open.
        if conn is not None:
            conn.rollback()
        print('ERROR -', thread_id)
    finally:
        # Original closed cursor/conn unconditionally after the except block,
        # which raised NameError whenever connect() itself failed.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

# 保存每个楼层输入(只爬取贴子的第一页楼层数据)

def save_thread_detail(thread_detail_id, thread_id, content, image, publish_date):
    """Insert one floor (reply) row into thread_detail_info.

    Only floors from the first page of each thread are crawled/saved.
    """
    # Parameterized statement; space added between the concatenated literals
    # (original produced "...publish_date)value (...").
    sql = ('insert into thread_detail_info(thread_detail_id, thread_id, content, image, publish_date) '
           'value (%s, %s, %s, %s, %s )')
    conn = None
    cursor = None
    try:
        conn = pymysql.connect(
            host='xx.xxx.xxx.xxx',   # TODO: server address
            port=3306,               # TODO: port
            user='xxx',              # TODO: user name
            password='xxx!',         # TODO: password
            charset='utf8',          # MySQL spells it 'utf8', not 'utf-8'
            database='xxx',          # TODO: schema name
            cursorclass=DictCursor)
        cursor = conn.cursor()
        r = cursor.execute(sql, (thread_detail_id, thread_id, content, image, publish_date))
        conn.commit()
        print('save detail success -', r)
    except Exception:
        # Roll back the failed transaction instead of leaving it open.
        if conn is not None:
            conn.rollback()
        print('!!!!!!!save detail error:-', thread_detail_id)
    finally:
        # Guarded cleanup: connect() may fail before cursor/conn exist, which
        # made the original unconditional close() raise NameError.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

# 将数据保存到txt文件

def savefile(data):
    """Append crawl records to the module-level text file.

    :param data: iterable of (title, author, url, images) records
    """
    for item in data:
        file.write('----------------------------------------\n')
        file.write('title:' + str(item[0]) + '\n')
        file.write('author:' + str(item[1]) + '\n')
        file.write('url:' + str(item[2]) + '\n')
        file.write('images:' + str(item[3]) + '\n')

# 图片下载到本地/服务器

def saveImg(images):
    """Download every image URL in *images* into ./tieba/.

    NOTE(review): the ./tieba/ directory must already exist — open() does not
    create intermediate directories.
    """
    for img in images:
        # .content yields the raw bytes of the response.
        img_data = requests.get(img, headers=header).content
        image_name = img.split('/')[-1]
        with open('./tieba/' + image_name, 'wb') as f:
            f.write(img_data)
        print('%s download img...' % image_name)


if __name__ == '__main__':
    start = int(input("输入开始爬取贴吧的页码:"))
    end = int(input('输入结束爬取贴吧的页码(默认请输入0):'))
    # 0 means "crawl to the very end"; 3057000 approximated the forum's last
    # pn offset when this was written.
    end = end + 1 if end != 0 else 3057000 + 1
    spider(start, end, 50)

当然可以!以下是一个简单的示例代码,用于将爬取到的数据存储到 MySQL 数据库中: ```python # 导入所需的库 import pymysql # 连接到 MySQL 数据库 connection = pymysql.connect( host='localhost', user='your_username', password='your_password', db='your_database_name', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor ) # 创建表(如果不存在) create_table_query = ''' CREATE TABLE IF NOT EXISTS `spider_data` ( `id` INT AUTO_INCREMENT PRIMARY KEY, `title` VARCHAR(255), `content` TEXT ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; ''' with connection.cursor() as cursor: cursor.execute(create_table_query) connection.commit() # 将爬取到的数据插入到数据库中 def insert_data(title, content): insert_query = 'INSERT INTO `spider_data` (`title`, `content`) VALUES (%s, %s)' with connection.cursor() as cursor: cursor.execute(insert_query, (title, content)) connection.commit() # 爬虫代码部分(这里只是简单示例) def spider(): # 爬取数据的过程... title = "示例标题" content = "示例内容" # 将数据插入数据库 insert_data(title, content) # 执行爬虫代码 spider() # 关闭数据库连接 connection.close() ``` 请注意,这只是一个简单的示例,实际情况可能会根据你的具体需求而有所变化。确保你已经安装了 `pymysql` 库,并根据你的实际情况修改连接数据库的参数、创建表的语句和爬虫代码部分。希望对你有所帮助!
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值