Data Collection and Fusion: Assignment 4 - 冰凉小手

Assignment ①:

Code

items

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapysqlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class BookItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    publisher = scrapy.Field()
    detail = scrapy.Field()
    price = scrapy.Field()

pipeline

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql

class BookPipeline:
    def process_item(self, item, spider):
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            print()
            if self.opened:
                # SQL statement that inserts one book record
                self.cursor.execute(
                    "insert into books (bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) values(%s,%s,%s,%s,%s,%s)",
                    (item["title"], item["author"], item["publisher"], item["date"], item["price"], item["detail"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item

    def open_spider(self, spider):
        print("opened")
        try:
            # create the database connection
            self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd="383186", db="book_db", charset="utf8")
            # create a cursor object
            self.cursor = self.con.cursor()
            # clear any rows left over from a previous run
            self.cursor.execute("delete from books")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
            print("closed")
            print("Crawled a total of", self.count, "books")

Spider

import scrapy
from ..items import BookItem
# from bs4 import BeautifulSoup
from bs4 import UnicodeDammit

class MySpider(scrapy.Spider):
    name = "mySpider"
    key = 'python'
    source_url = ''

    def start_requests(self):
        url = self.source_url + "?key=" + self.key
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            lis = selector.xpath("//li['@ddt-pit'][starts-with(@class,'line')]")
            for li in lis:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()
                # detail is sometimes missing, in which case extract_first() returns None
                item = BookItem()
                item["title"] = title.strip() if title else ""
                item["author"] = author.strip() if author else ""
                item["date"] = date.strip()[1:] if date else ""
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item
            # on the last page the "next" link is None
            link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as err:
            print(err)
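
Before running the full spider it can help to check the XPath expressions on a small HTML fragment. This is only a sketch: the fragment below is hand-written to mimic the assumed page structure, it is not a real search result.

import scrapy

# Hand-written fragment mimicking one result <li>; the real page has more markup.
html = """
<ul>
  <li ddt-pit="1" class="line1">
    <a title="Example Book"></a>
    <p class="price"><span class="search_now_price">¥59.00</span></p>
    <p class="detail">An example description</p>
  </li>
</ul>
"""
selector = scrapy.Selector(text=html)
li = selector.xpath("//li['@ddt-pit'][starts-with(@class,'line')]")[0]
print(li.xpath("./a[position()=1]/@title").extract_first())      # Example Book
print(li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first())  # ¥59.00
print(li.xpath("./p[@class='detail']/text()").extract_first())   # An example description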

settings and run

ITEM_PIPELINES = {
   '': 300,
}
from scrapy import cmdline
cmdline.execute("scrapy crawl mySpider -s LOG_ENABLED=False".split())

Results

d6637ff171dcaa3c354c5258e4473fab.png

c8d20926f0828715151723e85df82f26.png

Assignment ②

Code

items

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class EaststockItem(scrapy.Item):
    data = scrapy.Field()
    f12 = scrapy.Field()
    f14 = scrapy.Field()
    f2 = scrapy.Field()
    f3 = scrapy.Field()
    f4 = scrapy.Field()
    f5 = scrapy.Field()
    f6 = scrapy.Field()
    f7 = scrapy.Field()
    f15 = scrapy.Field()
    f16 = scrapy.Field()
    f17 = scrapy.Field()
    f18 = scrapy.Field()

pipeline

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


# class EaststockPipeline:
#     def process_item(self, item, spider):
#         return item
import pymysql

class EaststockPipeline:
    def process_item(self, item, spider):
        try:
            print(item["f12"])
            print(item["f14"])
            print(item["f2"])
            print(item["f3"])
            print(item["f4"])
            print(item["f5"])
            print(item["f6"])
            print(item["f7"])
            print(item["f15"])
            print(item["f16"])
            print(item["f17"])
            print(item["f18"])
            print()
            # print(type(item["f12"]))
            # print(type(item["f14"]))
            # print(type(item["f2"]))
            # print(type(item["f3"]))
            # print(type(item["f4"]))
            # print(type(item["f5"]))
            # print(type(item["f6"]))
            # print(type(item["f7"]))
            # print(type(item["f15"]))
            # print(type(item["f16"]))
            # print(type(item["f17"]))
            # print(type(item["f18"]))
            if self.opened:
                # SQL statement that inserts one stock record
                self.cursor.execute(
                    "insert into eastmoney (e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (item["f12"], item["f14"], item["f2"], item["f3"], item["f4"], item["f5"], item["f6"], item["f7"], item["f15"], item["f16"], item["f17"], item["f18"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item

    def open_spider(self, spider):
        print("opened")
        try:
            # create the database connection
            self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd="383186", db="stock_db", charset="utf8")
            # create a cursor object
            self.cursor = self.con.cursor()
            # clear any rows left over from a previous run
            self.cursor.execute("delete from eastmoney")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
            print("closed")
            print("Crawled a total of", self.count, "stock records")

Spider

import scrapy
import re
import urllib.request
from bs4 import UnicodeDammit
from bs4 import BeautifulSoup
from ..items import EaststockItem

class StockSpider(scrapy.Spider):
    name = 'stock'
    page = 1
    # allowed_domains = ['']
    start_urls = ['']

    def parse(self, response):
        while self.page <= 100:
            url = "" % (self.page)
            headers = {
                "User-Agent": "Mozilla/ (Windows NT 10.0; WOW64) AppleWebKit/ (KHTML, like Gecko) "
                              "Chrome/70.0.3538.25 Safari/ Core/1.70.3775.400 QQBrowser/10.6.4209.400"}
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")  # soup is not used below; the payload is parsed with a regex instead
            data = re.findall(r'"diff":\[(.*?)]', data)
            datas = data[0].strip("{").strip("}").split('},{')  # strip the leading "{" and trailing "}", then split on "},{"
            for i in range(len(datas)):
                item = EaststockItem()
                line = datas[i].replace('"', "").split(",")  # remove the double quotes and split on ","
                item["f12"] = line[6][4:]
                item["f14"] = line[7][4:]
                item["f2"] = line[0][3:]
                item["f3"] = line[1][3:]
                item["f4"] = line[2][3:]
                item["f5"] = line[3][3:]
                item["f6"] = line[4][3:]
                item["f7"] = line[5][3:]
                item["f15"] = line[8][4:]
                item["f16"] = line[9][4:]
                item["f17"] = line[10][4:]
                item["f18"] = line[11][4:]
                yield item
            self.page += 1
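
To make the slicing above concrete, here is a small self-contained demo on an invented "diff" fragment. The field values are made up; the [3:] and [4:] offsets simply strip the "f2:"-style prefixes left after removing the quotes.

import re

# Invented sample resembling the "diff" part of the EastMoney response.
text = ('... "diff":[{"f2":10.5,"f3":1.2,"f4":0.1,"f5":1000,"f6":12345,"f7":2.3,'
        '"f12":600000,"f14":"StockA","f15":11.0,"f16":10.0,"f17":10.2,"f18":10.4}] ...')

data = re.findall(r'"diff":\[(.*?)]', text)
datas = data[0].strip("{").strip("}").split('},{')   # one string per stock record
for record in datas:
    line = record.replace('"', "").split(",")
    print(line[6][4:], line[7][4:], line[0][3:])     # stock code, name, latest price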

settings and run

ITEM_PIPELINES = {
   '': 300,
}
from scrapy import cmdline
cmdline.execute("scrapy crawl stock -s LOG_ENABLED=false".split())

Run results

d65a7865abad92af965dccb57b961857.png

e70ba9924ba4cacf2023c9de625deaf1.png

Assignment ③

Code

items

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class BankItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Currency = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    bTime = scrapy.Field()

pipeline

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql

class BankPipeline:
    def open_spider(self, spider):
        print('opened')
        try:
            # create the connection; autocommit=True makes every INSERT take effect immediately
            self.con = pymysql.connect(host='localhost', port=3306, user='root', passwd='383186', db='bank_db', charset='utf8', autocommit=True)
            self.cursor = self.con.cursor()
            # clear any rows left over from a previous run
            self.cursor.execute('delete from foreignexchange')
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.close()
            self.opened = False
        print('closed')
        print('Crawled a total of', self.count, 'currencies')

    def process_item(self, item, spider):
        try:
            print(type(item["Currency"]))
            print(type(item["TSP"]))
            print(type(item["CSP"]))
            print(type(item["TBP"]))
            print(type(item["CBP"]))
            print(type(item["bTime"]))
            print()
            if self.opened:
                ("insert into foreignexchange (bCurrency,bTSP,bCSP,bTBP,bCBP,bTime) value (%s,%s,%s,%s,%s,%s)",(item['Currency'],item['TSP'],item['CSP'],item['TBP'],item['CBP'],item['bTime']))
                 += 1
        except Exception as err:
            print(err)
        return item

settings and run

ITEM_PIPELINES = {
   '': 300,
}
from scrapy import cmdline
cmdline.execute("scrapy crawl bank -s LOG_ENABLED=False".split())

Spider

import scrapy
from ..items import BankItem
from scrapy.selector import Selector

class BankSpider(scrapy.Spider):
    name = 'bank'
    start_urls = ['']

    def parse(self, response):
        try:
            data = response.body.decode()
            selector = Selector(text=data)
            table = selector.xpath("//table[@class='data']/tr")
            for t in table[1:]:
                currency = t.xpath("./td[position()=1]/text()").extract_first()
                tsp = t.xpath("./td[position()=4]/text()").extract_first()
                csp = t.xpath("./td[position()=5]/text()").extract_first()
                tbp = t.xpath("./td[position()=6]/text()").extract_first()
                cbp = t.xpath("./td[position()=7]/text()").extract_first()
                time = t.xpath("./td[position()=8]/text()").extract_first()
                item = BankItem()
                item["Currency"] = currency.strip() if currency else ""
                item["TSP"] = tsp.strip() if tsp else ""
                item["CSP"] = csp.strip() if csp else ""
                item["TBP"] = tbp.strip() if tbp else ""
                item["CBP"] = cbp.strip() if cbp else ""
                item["bTime"] = time.strip() if time else ""
                yield item
        except Exception as err:
            print(err)

Database creation

498ff1fe9dc7d1d96c197e90c332132f.png
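
Since the screenshot does not reproduce well as text, here is a rough pymysql equivalent of creating the foreignexchange table; the column names come from the INSERT in BankPipeline, while the types are assumptions and may differ from the screenshot.

import pymysql

# Assumed schema: names match the INSERT in BankPipeline, types are guesses.
con = pymysql.connect(host="localhost", port=3306, user="root",
                      passwd="383186", db="bank_db", charset="utf8")
cursor = con.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS foreignexchange (
        bCurrency VARCHAR(64),
        bTSP      VARCHAR(32),
        bCSP      VARCHAR(32),
        bTBP      VARCHAR(32),
        bCBP      VARCHAR(32),
        bTime     VARCHAR(32)
    ) DEFAULT CHARSET = utf8
""")
con.commit()
con.close()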

Results

ca60e2f671acf79aadbf1857e107949d.png

cc1be8490baca723dc7001f14f94f302.png

Problems and solutions (reflections)

(1) Error while running: (1054, "Unknown column 'bDetail' in 'field list'")

  • The column does not exist in the table, i.e. the column name in the SQL statement is misspelled.
  • The column name in the table contains an extra space; removing the space fixes it. (This was my case; a quick check is sketched below.)
  • The column was added to the table later, so selecting all columns works but selecting this one on its own fails; renaming the column in the table and saving fixes it.
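
A quick way to spot a stray space in a column name is to list the columns exactly as MySQL stores them; a sketch, reusing the book_db connection settings from above:

import pymysql

# Print every column name of books with repr(), so "bDetail " (trailing space)
# is clearly distinguishable from "bDetail".
con = pymysql.connect(host="localhost", port=3306, user="root",
                      passwd="383186", db="book_db", charset="utf8")
cursor = con.cursor()
cursor.execute("SHOW COLUMNS FROM books")
for field, *_ in cursor.fetchall():
    print(repr(field))
con.close()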

(2) Error code (1064, ...): SQL syntax error

  • There can be many causes. (Mine: in the "insert into eastmoney ..." statement, the column names must be separated by "," rather than by spaces; see the example below.)
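
For reference, the mistake was of the following form (a reconstruction, not copied from the original code):

# Wrong: column names separated by spaces -> MySQL error 1064
bad_sql = "insert into eastmoney (e1 e2 e3) values (%s, %s, %s)"

# Right: column names separated by commas
good_sql = "insert into eastmoney (e1, e2, e3) values (%s, %s, %s)"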

(3) Inserting data from Python into MySQL raises no error, but nothing is inserted

  • Add the parameter autocommit=True to connect(); inserts are then committed automatically and you can check whether the data really reached the database (see the sketch after this item):
    con = pymysql.connect(host=host_db, user=user_db, passwd=password, db=name_db, port=port_db, charset='utf8', autocommit=True)
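
A minimal sketch contrasting the two options, using placeholder connection values; the pipelines for assignments ① and ② rely on the explicit commit in close_spider(), while assignment ③ uses autocommit=True:

import pymysql

# Option 1: autocommit - every successful INSERT is committed immediately.
con = pymysql.connect(host="localhost", user="root", passwd="383186",
                      db="book_db", charset="utf8", autocommit=True)

# Option 2: autocommit off (the default) - changes must be committed explicitly,
# otherwise they are rolled back when the connection closes.
con2 = pymysql.connect(host="localhost", user="root", passwd="383186",
                       db="book_db", charset="utf8")
cursor = con2.cursor()
cursor.execute("insert into books (bTitle) values (%s)", ("demo",))
con2.commit()
con2.close()
con.close()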

(4) (1045, "Access denied for user 'root'@'localhost' (using password: YES)")

(5) Defining TEXT column types

327cc051424b5b8c1352f99df8b8d582.png

  • TEXT() cannot be used on its own; a number has to be given inside the parentheses (see the example after this list).
  • TINYTEXT: 256 bytes
  • TEXT: 65,535 bytes => ~64 KB
  • MEDIUMTEXT: 16,777,215 bytes => ~16 MB
  • LONGTEXT: 4,294,967,295 bytes => ~4 GB
  • MySQL Workbench only offers TINYTEXT and TEXT(); a custom size can be set with TEXT(n).
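
For example, the detail/description column of a table can be declared with whichever type matches the expected size; a small sketch of the corresponding DDL string:

# Assumed example DDL showing the TEXT variants listed above.
ddl = """
CREATE TABLE demo (
    short_note  TINYTEXT,       -- up to 256 bytes of storage
    detail      TEXT,           -- up to ~64 KB
    long_detail MEDIUMTEXT      -- up to ~16 MB
)
"""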

posted @ 2020-11-03 21:43  冰凉小手
