Assignment ①:
Code
items
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ScrapysqlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
class BookItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    publisher = scrapy.Field()
    detail = scrapy.Field()
    price = scrapy.Field()
pipeline
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql
class BookPipeline:
    def process_item(self, item, spider):
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            print()
            if self.opened:
                self.cursor.execute(
                    "insert into books (bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) values (%s,%s,%s,%s,%s,%s)",
                    (item["title"], item["author"], item["publisher"], item["date"], item["price"], item["detail"]))
                # SQL statement that inserts the scraped data
                self.count += 1
        except Exception as err:
            print(err)
        return item

    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd="383186", db="book_db", charset="utf8")
            # create the connection
            self.cursor = self.con.cursor()
            # create the cursor object
            self.cursor.execute("delete from books")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取", self.count, "本书籍")
Spider
import scrapy
from ..items import BookItem
# from bs4 import BeautifulSoup
from bs4 import UnicodeDammit

class MySpider(scrapy.Spider):
    name = "mySpider"
    key = 'python'
    source_url = 'http://search.dangdang.com/'

    def start_requests(self):
        url = self.source_url + "?key=" + self.key
        yield scrapy.Request(url=url, callback=self.parse)
    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            lis = selector.xpath("//li['@ddt-pit'][starts-with(@class,'line')]")
            for li in lis:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()
                # detail is sometimes missing, in which case it is None
                item = BookItem()
                item["title"] = title.strip() if title else ""
                item["author"] = author.strip() if author else ""
                item["date"] = date.strip()[1:] if date else ""
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item
                # print(item["title"])
            # on the last page the next-page link is None
            link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as err:
            print(err)
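The XPath expressions can be sanity-checked offline before running the spider. A minimal sketch against a hand-made HTML fragment; the markup below is invented, merely shaped like Dangdang's result list:

from scrapy import Selector

html = "<ul><li class='line1'><a title='Python编程'>link</a></li></ul>"
sel = Selector(text=html)
# Should print: Python编程
print(sel.xpath("//li[starts-with(@class,'line')]/a[position()=1]/@title").extract_first())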
settings and run
ITEM_PIPELINES = {
    'scrapysql.pipelines.BookPipeline': 300,  # package path assumed from ScrapysqlItem; use your project's package
}
from scrapy import cmdline
cmdline.execute("scrapy crawl mySpider -s LOG_ENABLED=False".split())
Results
Assignment ②
Code
items
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

# class EaststockItem(scrapy.Item):
#     # define the fields for your item here like:
#     # name = scrapy.Field()
#     pass

class EaststockItem(scrapy.Item):
    data = scrapy.Field()
    f12 = scrapy.Field()
    f14 = scrapy.Field()
    f2 = scrapy.Field()
    f3 = scrapy.Field()
    f4 = scrapy.Field()
    f5 = scrapy.Field()
    f6 = scrapy.Field()
    f7 = scrapy.Field()
    f15 = scrapy.Field()
    f16 = scrapy.Field()
    f17 = scrapy.Field()
    f18 = scrapy.Field()
pipeline
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
# class EaststockPipeline:
#     def process_item(self, item, spider):
#         return item

import pymysql

class EaststockPipeline:
    def process_item(self, item, spider):
        try:
            print(item["f12"])
            print(item["f14"])
            print(item["f2"])
            print(item["f3"])
            print(item["f4"])
            print(item["f5"])
            print(item["f6"])
            print(item["f7"])
            print(item["f15"])
            print(item["f16"])
            print(item["f17"])
            print(item["f18"])
            print()
            # print(type(item["f12"]))
            # print(type(item["f14"]))
            # print(type(item["f2"]))
            # print(type(item["f3"]))
            # print(type(item["f4"]))
            # print(type(item["f5"]))
            # print(type(item["f6"]))
            # print(type(item["f7"]))
            # print(type(item["f15"]))
            # print(type(item["f16"]))
            # print(type(item["f17"]))
            # print(type(item["f18"]))
            if self.opened:
                self.cursor.execute(
                    "insert into eastmoney (e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (item["f12"], item["f14"], item["f2"], item["f3"], item["f4"], item["f5"],
                     item["f6"], item["f7"], item["f15"], item["f16"], item["f17"], item["f18"]))
                # SQL statement that inserts the scraped data
                self.count += 1
        except Exception as err:
            print(err)
        return item

    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd="383186", db="stock_db", charset="utf8")
            # create the connection
            self.cursor = self.con.cursor()
            # create the cursor object
            self.cursor.execute("delete from eastmoney")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取", self.count, "条股票信息")
Spider
import scrapy
import re
import urllib.request
from bs4 import UnicodeDammit
from bs4 import BeautifulSoup
from ..items import EaststockItem

class StockSpider(scrapy.Spider):
    name = 'stock'
    page = 1
    # allowed_domains = ['']
    start_urls = ['']  # fill in the Eastmoney quote API URL here

    def parse(self, response):
        while self.page <= 100:
            url = "" % (self.page)  # the quote API URL goes here; it needs a %d placeholder for the page number
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400"}
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, 'lxml')
            data = re.findall(r'"diff":\[(.*?)]', data)
            datas = data[0].strip("{").strip("}").split('},{')  # strip the outer "{" and "}", then split on "},{"
            for i in range(len(datas)):
                item = EaststockItem()
                line = datas[i].replace('"', "").split(",")  # drop the double quotes and split on ","
                item["f12"] = line[6][4:]
                item["f14"] = line[7][4:]
                item["f2"] = line[0][3:]
                item["f3"] = line[1][3:]
                item["f4"] = line[2][3:]
                item["f5"] = line[3][3:]
                item["f6"] = line[4][3:]
                item["f7"] = line[5][3:]
                item["f15"] = line[8][4:]
                item["f16"] = line[9][4:]
                item["f17"] = line[10][4:]
                item["f18"] = line[11][4:]
                yield item
            self.page += 1
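A minimal demonstration of the strip/split logic above, run on a made-up two-record sample (the field values are invented):

s = '{"f2":10.5,"f12":"600000"},{"f2":8.2,"f12":"600036"}'
records = s.strip("{").strip("}").split('},{')
print(records)  # ['"f2":10.5,"f12":"600000"', '"f2":8.2,"f12":"600036"']
fields = records[0].replace('"', "").split(",")
print(fields[1][4:])  # 600000 -- drop the leading 'f12:' (4 characters)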
settings and run
ITEM_PIPELINES = {
    'eaststock.pipelines.EaststockPipeline': 300,  # package path assumed from EaststockItem; use your project's package
}
from scrapy import cmdline
cmdline.execute("scrapy crawl stock -s LOG_ENABLED=false".split())
Run results
Assignment ③
Code
items
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

class BankItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Currency = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    bTime = scrapy.Field()
pipeline
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql
class BankPipeline:
    def open_spider(self, spider):
        print('opened')
        try:
            self.con = pymysql.connect(host='localhost', port=3306, user='root', passwd='383186', db='bank_db', charset='utf8', autocommit=True)
            self.cursor = self.con.cursor()
            self.cursor.execute('delete from foreignexchange')
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.close()
            self.opened = False
        print('closed')
        print('总共爬取', self.count, '种外汇')

    def process_item(self, item, spider):
        try:
            print(type(item["Currency"]))
            print(type(item["TSP"]))
            print(type(item["CSP"]))
            print(type(item["TBP"]))
            print(type(item["CBP"]))
            print(type(item["bTime"]))
            print()
            if self.opened:
                self.cursor.execute(
                    "insert into foreignexchange (bCurrency,bTSP,bCSP,bTBP,bCBP,bTime) values (%s,%s,%s,%s,%s,%s)",
                    (item['Currency'], item['TSP'], item['CSP'], item['TBP'], item['CBP'], item['bTime']))
                self.count += 1
        except Exception as err:
            print(err)
        return item
settings and run
ITEM_PIPELINES = {
    'bank.pipelines.BankPipeline': 300,  # package name is a guess; use your actual project package
}
from scrapy import cmdline
cmdline.execute("scrapy crawl bank -s LOG_ENABLED=False".split())
Spider
import scrapy
from ..items import BankItem
from scrapy.selector import Selector

class BankSpider(scrapy.Spider):
    name = 'bank'
    start_urls = ['http://www.boc.cn/sourcedb/whpj/']  # assumed Bank of China exchange-rate page

    def parse(self, response):
        try:
            data = response.body.decode()
            selector = Selector(text=data)
            table = selector.xpath("//table[@class='data']/tr")
            for t in table[1:]:
                currency = t.xpath("./td[position()=1]/text()").extract_first()
                tsp = t.xpath("./td[position()=4]/text()").extract_first()
                csp = t.xpath("./td[position()=5]/text()").extract_first()
                tbp = t.xpath("./td[position()=6]/text()").extract_first()
                cbp = t.xpath("./td[position()=7]/text()").extract_first()
                time = t.xpath("./td[position()=8]/text()").extract_first()
                item = BankItem()
                item["Currency"] = currency.strip() if currency else ""
                item["TSP"] = tsp.strip() if tsp else ""
                item["CSP"] = csp.strip() if csp else ""
                item["TBP"] = tbp.strip() if tbp else ""
                item["CBP"] = cbp.strip() if cbp else ""
                item["bTime"] = time.strip() if time else ""
                yield item
        except Exception as err:
            print(err)
Database creation
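A minimal sketch of the foreignexchange table that the pipeline's insert statement expects; the column types are my assumptions:

import pymysql

con = pymysql.connect(host='localhost', port=3306, user='root', passwd='383186', db='bank_db', charset='utf8')
cursor = con.cursor()
cursor.execute("""
    create table if not exists foreignexchange (
        bCurrency varchar(64),
        bTSP varchar(16),
        bCSP varchar(16),
        bTBP varchar(16),
        bCBP varchar(16),
        bTime varchar(32)
    )
""")
con.close()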
Results
Problems and solutions (reflections)
(1) Runtime error (1054, "Unknown column 'bDetail' in 'field list'")
- The column does not exist in the table, i.e. the column name in the SQL statement is misspelled.
- A column name in the table contained a stray space; removing the space fixed it. (This was my case.)
- The column was added to the table later: selecting all columns works, but selecting it by name fails. Renaming the column in the table and saving fixes it.
(2) Error code (1064, ...): SQL syntax error
- There are many possible causes. (Mine: in "insert into eastmoney ...", the column list must be separated by commas, not spaces; see the sketch below.)
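A minimal illustration of that comma mistake (the table and column names are just examples):

# Wrong: columns separated by spaces triggers error 1064
bad_sql = "insert into eastmoney (e1 e2 e3) values (%s, %s, %s)"
# Right: columns separated by commas
good_sql = "insert into eastmoney (e1, e2, e3) values (%s, %s, %s)"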
(3) Python connects to MySQL and the insert raises no error, but nothing actually lands in the table.
- Add the parameter autocommit=True in connect(); writes are then committed automatically, so the data really reaches the database:
con = pymysql.connect(host=host_db, user=user_db, passwd=password, db=name_db, port=port_db, charset='utf8', cursorclass=pymysql.cursors.DictCursor, autocommit=True)
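The alternative to autocommit is calling commit() explicitly before closing. A minimal sketch; the connection parameters and table are placeholders:

import pymysql

con = pymysql.connect(host='localhost', port=3306, user='root', passwd='***', db='book_db', charset='utf8')
cursor = con.cursor()
cursor.execute("insert into books (bTitle) values (%s)", ("test",))
con.commit()  # without this (or autocommit=True), the insert is discarded when the connection closes
con.close()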
(4) (1045, "Access denied for user 'root'@'localhost' (using password: YES)")
- The user name or password passed to connect() does not match the MySQL account; re-check the credentials.
(5) Defining TEXT column types
- In MySQL Workbench you cannot pick a bare text(); the parentheses need a number.
- TINYTEXT: 255 bytes
- TEXT: 65,535 bytes => ~64 KB
- MEDIUMTEXT: 16,777,215 bytes => ~16 MB
- LONGTEXT: 4,294,967,295 bytes => ~4 GB
- The Workbench type list only offers TINYTEXT and TEXT(); with TEXT(n) you can set a custom size, and MySQL promotes the column to the smallest TEXT type that can hold n bytes, as the sketch below shows.
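For example, TEXT(70000) is too large for plain TEXT (max 65,535 bytes), so MySQL stores the column as MEDIUMTEXT. A quick check; the table name is made up:

import pymysql

con = pymysql.connect(host='localhost', port=3306, user='root', passwd='***', db='book_db', charset='utf8')
cursor = con.cursor()
cursor.execute("create table if not exists text_demo (d text(70000))")  # promoted to MEDIUMTEXT
cursor.execute("show columns from text_demo")
print(cursor.fetchall())  # the Type column reads 'mediumtext'
con.close()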