python版操作mysql数据库

python语句完成scrapy中的提交数据到mysql

from tqdm import tqdm
from colorama import init, Fore
from icon.word import show
from fake_useragent import UserAgent
from requests.exceptions import RequestException
import requests
import pymysql
import time
import re

def connect_db():
    """
    连接Mysql数据库
    :return: db
    """
    db = pymysql.connect(host='localhost', user='root', password='******', port=3306, db='36kr')
    # print('数据库连接成功!')
    return db

def save_to_mysql(datas):
    """
    将解析数据存储到Mysql数据库
   :param item: 获取的单页有效数据
    :return: None
    """
    # 连接数据库
    db = connect_db()
    # 获得Mysql操作指针
    cursor = db.cursor()
    # sql
    sql = "INSERT INTO kr(id, article_title, colum_name, colum_id, cover, publish_time, summary) " \
          "VALUES(%s, %s, %s, %s, %s, %s, %s)"
    for _item in datas:
        try:
            # 插入数据
            cursor.execute(sql, (_item['id'], _item['title'], _item['column_name'],
                                 _item['column_id'], _item['cover'], _item['publish_time'], _item['summary']))
            # 提交
            db.commit()
            # print('数据插入成功!')
        except Exception as e:
            # print('数据插入失败!',e)
            db.rollback()
    # 关闭数据库连接
    db.close()

def main():
    for i in tqdm(range(10),  desc='抓取进度'):
        # 获取
        items = get_one_page(i+1)
        # 解析
        data = parse_one_page(items)
        # 保存
        save_to_mysql(data)
        time.sleep(1)

具体解析解析创建连接如下:

def connect_db():
    """
    连接Mysql数据库
    :return: db
    """
    db = pymysql.connect(host='localhost', user='root', password='******', port=3306, db='36kr')
    # print('数据库连接成功!')
    return db

插入数据如下:

def save_to_mysql(datas):
    """
    将解析数据存储到Mysql数据库
   :param item: 获取的单页有效数据
    :return: None
    """
    # 连接数据库
    db = connect_db()
    # 获得Mysql操作指针
    cursor = db.cursor()
    # sql
    sql = "INSERT INTO kr(id, article_title, colum_name, colum_id, cover, publish_time, summary) " \
          "VALUES(%s, %s, %s, %s, %s, %s, %s)"
    for _item in datas:
        try:
            # 插入数据
            cursor.execute(sql, (_item['id'], _item['title'], _item['column_name'],
                                 _item['column_id'], _item['cover'], _item['publish_time'], _item['summary']))
            # 提交
            db.commit()
            # print('数据插入成功!')
        except Exception as e:
            # print('数据插入失败!',e)
            db.rollback()
    # 关闭数据库连接
    db.close()

接下来是一个完整的实例:

# -*- coding:utf-8 -*-

"""
Created at 21:04 at March 12,2019
@title: 爬取36kr的最新文章信息并保存至Mysql数据库
@author: Northxw
"""

from tqdm import tqdm
from colorama import init, Fore
from icon.word import show
from fake_useragent import UserAgent
from requests.exceptions import RequestException
import requests
import pymysql
import time
import re

init(autoreset=True)

def connect_db():
    """
    连接Mysql数据库
    :return: db
    """
    db = pymysql.connect(host='localhost', user='root', password='******', port=3306, db='36kr')
    # print('数据库连接成功!')
    return db

def get_one_page(page):
    """
    获取一页的最新文章JSON数据
    :param page: 页码
    :return: json
    """
    # 真实请求
    url = 'https://36kr.com/api/search-column/mainsite?per_page=20&page={}'.format(str(page))
    # 设置Headers
    headers = {
        'User-Agent': UserAgent().random,
        'Referer': 'https://36kr.com/',
        'Host': '36kr.com'
    }
    # 获取网页源代码
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            items = response.json()['data']['items']
            return items
        return None
    except RequestException:
        return None

def parse_one_page(items):
    """
    解析获取的JSON数据
    :param items: 获取的JSON数据段items
    :return: dict
    """
    # 存储单页总数据
    datas = list()
    for item in items:
        data= {
            # 文章ID
            'id': str(item['id']),
            # 标题
            'title': item['title'],
            # 类别
            'column_name': item['column_name'],
            # id
            'column_id': item['column_id'],
            # 封面图片链接
            'cover': item['cover'],
            # 发布时间
            'publish_time': item['published_at'] ,
            # 文章总结
            'summary': item['summary']
        }
        # 处理时间
        data['publish_time'] = re.search('(.*?)T(.*?)\+.*', data['publish_time']).group(1) + ' ' + re.search('(.*?)T(.*?)\+.*', data['publish_time']).group(2)
        # 存储
        datas.append(data)
        # 将标题写入文件.制作中文词云
        with open('./icon/36kr.txt', 'a', encoding='utf-8') as f:
            f.write(data['title'])
    return datas

def save_to_mysql(datas):
    """
    将解析数据存储到Mysql数据库
   :param item: 获取的单页有效数据
    :return: None
    """
    # 连接数据库
    db = connect_db()
    # 获得Mysql操作指针
    cursor = db.cursor()
    # sql
    sql = "INSERT INTO kr(id, article_title, colum_name, colum_id, cover, publish_time, summary) " \
          "VALUES(%s, %s, %s, %s, %s, %s, %s)"
    for _item in datas:
        try:
            # 插入数据
            cursor.execute(sql, (_item['id'], _item['title'], _item['column_name'],
                                 _item['column_id'], _item['cover'], _item['publish_time'], _item['summary']))
            # 提交
            db.commit()
            # print('数据插入成功!')
        except Exception as e:
            # print('数据插入失败!',e)
            db.rollback()
    # 关闭数据库连接
    db.close()

def main():
    """
    主函数
    :return: None
    """
    print(Fore.RED + '提示:截止目前的总数据量是77998条, 测试仅抓取前10页的共200条数据!\n')
    for i in tqdm(range(10),  desc='抓取进度'):
        # 获取
        items = get_one_page(i+1)
        # 解析
        data = parse_one_page(items)
        # 保存
        save_to_mysql(data)
        time.sleep(1)

if __name__ == '__main__':
    main()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值