作业一
爬取当当网站图书数据
items.py
import scrapy
class BookItem(scrapy.Item):
    """Item holding one book record scraped from the dangdang.com search results."""
    title = scrapy.Field()      # book title
    author = scrapy.Field()     # first listed author
    date = scrapy.Field()       # publication date
    publisher = scrapy.Field()  # publisher name
    detail = scrapy.Field()     # short description text
    price = scrapy.Field()      # current sale price
MySpider.py
import scrapy
from ..items import BookItem
from bs4 import UnicodeDammit
class MySpider(scrapy.Spider):
    """Crawl dangdang.com book search results for `key` and yield BookItem objects,
    following the pagination links until the last page."""
    name = "mySpider"
    key = 'python'
    # NOTE(review): the source URL was blank in the original; this is the dangdang
    # search endpoint the XPath below targets — confirm before running.
    source_url = 'http://search.dangdang.com/'

    def start_requests(self):
        # Build the first search-results URL from the keyword.
        url = MySpider.source_url + "?key=" + MySpider.key
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            # The site serves mixed encodings; let UnicodeDammit guess utf-8 vs gbk.
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            # Each result row is an <li> whose class starts with 'line'.
            lis = selector.xpath("//li['@ddt-pit'][starts-with(@class,'line')]")
            for li in lis:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()=last()- 1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title ").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()
                item = BookItem()
                item["title"] = title.strip() if title else ""
                item["author"] = author.strip() if author else ""
                # The raw date text begins with a '/' separator, so drop its first char.
                item["date"] = date.strip()[1:] if date else ""
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item
            # Follow the "next page" link, if present, to crawl every results page.
            link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as err:
            print(err)
pipelines.py
from itemadapter import ItemAdapter
import pymysql
class BookPipeline:
    """Persist scraped BookItem records into MySQL, one row per item."""

    def open_spider(self, spider):
        print("opened")
        try:
            # TODO(review): host was blank in the original — confirm DB location.
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="axx123123", db="mydb",
                                       charset="utf8")
            self.cursor = self.con.cursor()
            # Clear old rows so each crawl stores a fresh snapshot.
            self.cursor.execute("delete from money")
            self.opened = True
            self.count = 0  # running row id used as the primary key
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")

    def process_item(self, item, spider):
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            print()
            if self.opened:
                # Assign the next sequential Id before inserting (7 columns, 7 values).
                self.count += 1
                self.cursor.execute("insert into money(Id,bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) "
                    "values (%s,%s,%s,%s,%s,%s,%s)",
                    (str(self.count), item["title"], item["author"], item["publisher"],
                     item["date"], item["price"], item["detail"]))
        except Exception as err:
            print(err)
        return item
运行截图
心得体会
sql语言也有课程在学,所以sql的部分代码也还算不错。
可以在 PyCharm 里面直接查询数据库;但如果 SQL 语句报错,它只会提示大概的出错位置,不会提示具体的错误类型,排查起来比较费劲。
作业二
Scrapy+Xpath+MySQL数据库存储技术路线爬取股票相关信息
items.py
import scrapy
class ShareItem(scrapy.Item):
    """Item holding one stock quote row scraped from the listing table."""
    id = scrapy.Field()             # sequential row number
    shareNumber = scrapy.Field()    # stock code
    shareName = scrapy.Field()      # stock name
    newestPrice = scrapy.Field()    # latest price
    changeRate = scrapy.Field()     # percentage change
    changePrice = scrapy.Field()    # absolute price change
    turnover = scrapy.Field()       # traded volume
    turnoverPrice = scrapy.Field()  # traded amount
    amplitude = scrapy.Field()      # intraday amplitude
    highest = scrapy.Field()        # day high
    lowest = scrapy.Field()         # day low
    today = scrapy.Field()          # today's opening price
    yesterday = scrapy.Field()      # yesterday's closing price
MySpider.py
import scrapy
from selenium import webdriver
from ..items import ShareItem
class MySpider(scrapy.Spider):
    """Scrape one page of stock quotes rendered by JavaScript, using Selenium to
    obtain the populated table, and yield ShareItem objects."""
    name = 'share'

    def start_requests(self):
        # NOTE(review): the URL was blank in the original; the table id below
        # matches the eastmoney listing page — confirm before running.
        url = 'http://quote.eastmoney.com/center/gridlist.html'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # The quote table is filled in by JavaScript, so re-fetch the page with a
        # real browser instead of parsing the static response body.
        driver = webdriver.Firefox()
        try:
            driver.get(response.url)
            rows = driver.find_elements_by_xpath(
                "//table[@id='table_wrapper-table'][@class='table_wrapper-table']/tbody/tr")
            for li in rows:
                # Column positions follow the site's table layout (position 4 is skipped).
                id = li.find_elements_by_xpath("./td[position()=1]")[0].text
                shareNumber = li.find_elements_by_xpath("./td[position()=2]/a")[0].text
                shareName = li.find_elements_by_xpath("./td[position()=3]/a")[0].text
                newestPrice = li.find_elements_by_xpath("./td[position()=5]/span")[0].text
                changeRate = li.find_elements_by_xpath("./td[position()=6]/span")[0].text
                changePrice = li.find_elements_by_xpath("./td[position()=7]/span")[0].text
                turnover = li.find_elements_by_xpath("./td[position()=8]")[0].text
                turnoverPrice = li.find_elements_by_xpath("./td[position()=9]")[0].text
                amplitude = li.find_elements_by_xpath("./td[position()=10]")[0].text
                highest = li.find_elements_by_xpath("./td[position()=11]/span")[0].text
                lowest = li.find_elements_by_xpath("./td[position()=12]/span")[0].text
                today = li.find_elements_by_xpath("./td[position()=13]/span")[0].text
                yesterday = li.find_elements_by_xpath("./td[position()=14]")[0].text
                print("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
                      % (id, shareNumber, shareName, newestPrice, changeRate, changePrice,
                         turnover, turnoverPrice, amplitude, highest, lowest, today, yesterday))
                item = ShareItem()
                item["id"] = id
                item["shareNumber"] = shareNumber
                item["shareName"] = shareName
                item["newestPrice"] = newestPrice
                item["changeRate"] = changeRate
                item["changePrice"] = changePrice
                item["turnover"] = turnover
                item["turnoverPrice"] = turnoverPrice
                item["amplitude"] = amplitude
                item["highest"] = highest
                item["lowest"] = lowest
                item["today"] = today
                item["yesterday"] = yesterday
                yield item
        except Exception as err:
            print(err)
        finally:
            # Always release the browser, even when scraping fails.
            driver.quit()
pipelines.py
import pymysql
class SharePipeline:
    """Persist scraped ShareItem records into MySQL, one row per item."""

    def open_spider(self, spider):
        print("opened")
        try:
            # TODO(review): host was blank in the original — confirm DB location.
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="axx123123", db="mydb",
                                       charset="utf8")
            self.cursor = self.con.cursor()
            # Clear old rows so each crawl stores a fresh snapshot.
            self.cursor.execute("delete from share")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")

    def process_item(self, item, spider):
        try:
            if self.opened:
                self.cursor.execute(
                    "insert into share(Sid,Snumber,Sname,SnewestPrice,SchangeRate,SchangePrice,"
                    "Sturnover,SturnoverPrice,Samplitude,Shighest,Slowest,Stoday,Syesterday)"
                    "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (item["id"], item["shareNumber"], item["shareName"], item["newestPrice"],
                     item["changeRate"], item["changePrice"], item["turnover"], item["turnoverPrice"],
                     item["amplitude"], item["highest"], item["lowest"], item["today"], item["yesterday"]))
        except Exception as err:
            print(err)
        return item
运行截图
心得体会
算是第一次接触selenium,比较麻烦的还得用firefox,组装的过程比较麻烦(所以数据表的创建就很笼统的全部是字符串类型),内容和上次实验一样,但是寻找数据方面上花的时间少了许多,selenium+xpath寻找到对应数据的速度比上次快,而且处理上面也省了许多事(firefox自带查找 太香了,孩子很开心 下次还用)
作业三
使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取外汇网站数据。
items.py
import scrapy
class MoneyItem(scrapy.Item):
    """Item holding one foreign-exchange quote row."""
    id = scrapy.Field()        # sequential row number
    currency = scrapy.Field()  # currency name
    tsp = scrapy.Field()       # telegraphic selling price
    csp = scrapy.Field()       # cash selling price
    tbp = scrapy.Field()       # telegraphic buying price
    cbp = scrapy.Field()       # cash buying price
    time = scrapy.Field()      # quote timestamp
MySpider.py
import scrapy
from ..items import MoneyItem
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
class MySpider(scrapy.Spider):
    """Scrape foreign-exchange quotes from the bank's rate table and yield
    MoneyItem objects, skipping the table's header row."""
    name = "mySpider"
    # NOTE(review): the URL was blank in the original; the 'fontbold'/'numberright'
    # cell classes below match the CMB FX quote page — confirm before running.
    source_url = 'http://fx.cmbchina.com/hq/'

    def start_requests(self):
        url = MySpider.source_url
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            # The page may be served as utf-8 or gbk; let UnicodeDammit decide.
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            count = 1   # row counter: row 1 is the header and carries no data
            rows = selector.xpath("//table[@cellspacing='1']/tr")
            idn = 1     # sequential id assigned to each data row
            for li in rows:
                if count != 1:
                    id = idn
                    currency = li.xpath("./td[@class='fontbold']/text()").extract_first().strip()
                    tsp = li.xpath("./td[@class='numberright'][position()=1]/text()").extract_first().strip()
                    csp = li.xpath("./td[@class='numberright'][position()=2]/text()").extract_first().strip()
                    tbp = li.xpath("./td[@class='numberright'][position()=3]/text()").extract_first().strip()
                    cbp = li.xpath("./td[@class='numberright'][position()=4]/text()").extract_first().strip()
                    time = li.xpath("./td[@align='center'][position()=3]/text()").extract_first().strip()
                    item = MoneyItem()
                    item["id"] = id if id else ""
                    item["currency"] = currency if currency else ""
                    item["tsp"] = tsp if tsp else ""
                    item["csp"] = csp if csp else ""
                    item["tbp"] = tbp if tbp else ""
                    item["cbp"] = cbp if cbp else ""
                    item["time"] = time if time else ""
                    yield item
                    idn = idn + 1
                count = count + 1
        except Exception as err:
            print(err)
pipelines.py
from itemadapter import ItemAdapter
import pymysql
class MoneyPipeline:
    """Persist scraped MoneyItem records into MySQL, one row per item."""

    def open_spider(self, spider):
        print("opened")
        try:
            # TODO(review): host was blank in the original — confirm DB location.
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="axx123123", db="mydb",
                                       charset="utf8")
            self.cursor = self.con.cursor()
            # Clear old rows so each crawl stores a fresh snapshot.
            self.cursor.execute("delete from money")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")

    def process_item(self, item, spider):
        try:
            if self.opened:
                # Bug fix: the Mcbp column must receive item["cbp"]; the original
                # passed item["tsp"] twice.
                self.cursor.execute("insert into money(Mid,Mcurrency,Mtsp,Mcsp,Mtbp,Mcbp,Mtime) "
                    "values (%s,%s,%s,%s,%s,%s,%s)",
                    (item["id"], item["currency"], item["tsp"], item["csp"],
                     item["tbp"], item["cbp"], item["time"]))
        except Exception as err:
            print(err)
        return item
运行截图
心得体会
在前两个实验的基础下,加上外汇网站本身的html结构不是很复杂,多用tag[condition]的方法很快就能找到对应数据的xpath表达式