作业一
爬取当当网站图书数据
items.py
import scrapy
class BookItem(scrapy.Item):
    """Item holding one book record scraped from the dangdang.com search results."""
    title = scrapy.Field()      # book title
    author = scrapy.Field()     # first listed author
    date = scrapy.Field()       # publication date
    publisher = scrapy.Field()  # publisher name
    detail = scrapy.Field()     # short description text
    price = scrapy.Field()      # current sale price
MySpider.py
import scrapy
from ..items import BookItem
from bs4 import UnicodeDammit
class MySpider(scrapy.Spider):
    """Crawl dangdang.com book search results for `key` and yield BookItem objects,
    following the pagination links until the last page."""
    name = "mySpider"
    key = 'python'
    # NOTE(review): the source URL was blank in the original; this is the dangdang
    # search endpoint the XPath below targets — confirm before running.
    source_url = 'http://search.dangdang.com/'

    def start_requests(self):
        # Build the first search-results URL from the keyword.
        url = MySpider.source_url + "?key=" + MySpider.key
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            # The site serves mixed encodings; let UnicodeDammit guess utf-8 vs gbk.
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            # Each result row is an <li> whose class starts with 'line'.
            lis = selector.xpath("//li['@ddt-pit'][starts-with(@class,'line')]")
            for li in lis:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()=last()- 1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title ").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()
                item = BookItem()
                item["title"] = title.strip() if title else ""
                item["author"] = author.strip() if author else ""
                # The raw date text begins with a '/' separator, so drop its first char.
                item["date"] = date.strip()[1:] if date else ""
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item
            # Follow the "next page" link, if present, to crawl every results page.
            link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as err:
            print(err)
pipelines.py
from itemadapter import ItemAdapter
import pymysql
class BookPipeline:
    """Persist scraped BookItem records into MySQL, one row per item."""

    def open_spider(self, spider):
        print("opened")
        try:
            # TODO(review): host was blank in the original — confirm DB location.
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="axx123123", db="mydb",
                                       charset="utf8")
            self.cursor = self.con.cursor()
            # Clear old rows so each crawl stores a fresh snapshot.
            self.cursor.execute("delete from money")
            self.opened = True
            self.count = 0  # running row id used as the primary key
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")

    def process_item(self, item, spider):
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            print()
            if self.opened:
                # Assign the next sequential Id before inserting (7 columns, 7 values).
                self.count += 1
                self.cursor.execute("insert into money(Id,bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) "
                    "values (%s,%s,%s,%s,%s,%s,%s)",
                    (str(self.count), item["title"], item["author"], item["publisher"],
                     item["date"], item["price"], item["detail"]))
        except Exception as err:
            print(err)
        return item
运行截图
心得体会
sql语言也有课程在学,所以sql的部分代码也还算不错。
可以在 PyCharm 里面直接查询数据库;但如果 SQL 语句报错,它只会提示大概的出错位置,不会提示具体的错误类型,排查起来比较费劲。
作业二
Scrapy+Xpath+MySQL数据库存储技术路线爬取股票相关信息
items.py
import scrapy
class ShareItem(scrapy.Item):
    """Item holding one stock quote row scraped from the listing table."""
    id = scrapy.Field()             # sequential row number
    shareNumber = scrapy.Field()    # stock code
    shareName = scrapy.Field()      # stock name
    newestPrice = scrapy.Field()    # latest price
    changeRate = scrapy.Field()     # percentage change
    changePrice = scrapy.Field()    # absolute price change
    turnover = scrapy.Field()       # traded volume
    turnoverPrice = scrapy.Field()  # traded amount
    amplitude = scrapy.Field()      # intraday amplitude
    highest = scrapy.Field()        # day high
    lowest = scrapy.Field()         # day low
    today = scrapy.Field()          # today's opening price
    yesterday = scrapy.Field()      # yesterday's closing price
MySpider.py
import scrapy
from selenium import webdriver
from ..items import ShareItem
class MySpider(scrapy.Spider):
    """Scrape one page of stock quotes rendered by JavaScript, using Selenium to
    obtain the populated table, and yield ShareItem objects."""
    name = 'share'

    def start_requests(self):
        # NOTE(review): the URL was blank in the original; the table id below
        # matches the eastmoney listing page — confirm before running.
        url = 'http://quote.eastmoney.com/center/gridlist.html'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # The quote table is filled in by JavaScript, so re-fetch the page with a
        # real browser instead of parsing the static response body.
        driver = webdriver.Firefox()
        try:
            driver.get(response.url)
            rows = driver.find_elements_by_xpath(
                "//table[@id='table_wrapper-table'][@class='table_wrapper-table']/tbody/tr")
            for li in rows:
                # Column positions follow the site's table layout (position 4 is skipped).
                id = li.find_elements_by_xpath("./td[position()=1]")[0].text
                shareNumber = li.find_elements_by_xpath("./td[position()=2]/a")[0].text
                shareName = li.find_elements_by_xpath("./td[position()=3]/a")[0].text
                newestPrice = li.find_elements_by_xpath("./td[position()=5]/span")[0].text
                changeRate = li.find_elements_by_xpath("./td[position()=6]/span")[0].text
                changePrice = li.find_elements_by_xpath("./td[position()=7]/span")[0].text
                turnover = li.find_elements_by_xpath("./td[position()=8]")[0].text
                turnoverPrice = li.find_elements_by_xpath("./td[position()=9]")[0].text
                amplitude = li.find_elements_by_xpath("./td[position()=10]")[0].text
                highest = li.find_elements_by_xpath("./td[position()=11]/span")[0].text
                lowest = li.find_elements_by_xpath("./td[position()=12]/span")[0].text
                today = li.find_elements_by_xpath("./td[position()=13]/span")[0].text
                yesterday = li.find_elements_by_xpath("./td[position()=14]")[0].text
                print("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
                      % (id, shareNumber, shareName, newestPrice, changeRate, changePrice,
                         turnover, turnoverPrice, amplitude, highest, lowest, today, yesterday))
                item = ShareItem()
                item["id"] = id
                item["shareNumber"] = shareNumber
                item["shareName"] = shareName
                item["newestPrice"] = newestPrice
                item["changeRate"] = changeRate
                item["changePrice"] = changePrice
                item["turnover"] = turnover
                item["turnoverPrice"] = turnoverPrice
                item["amplitude"] = amplitude
                item["highest"] = highest
                item["lowest"] = lowest
                item["today"] = today
                item["yesterday"] = yesterday
                yield item
        except Exception as err:
            print(err)
        finally:
            # Always release the browser, even when scraping fails.
            driver.quit()
pipelines.py
import pymysql
class SharePipeline:
    """Persist scraped ShareItem records into MySQL, one row per item."""

    def open_spider(self, spider):
        print("opened")
        try:
            # TODO(review): host was blank in the original — confirm DB location.
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="axx123123", db="mydb",
                                       charset="utf8")
            self.cursor = self.con.cursor()
            # Clear old rows so each crawl stores a fresh snapshot.
            self.cursor.execute("delete from share")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")

    def process_item(self, item, spider):
        try:
            if self.opened:
                self.cursor.execute(
                    "insert into share(Sid,Snumber,Sname,SnewestPrice,SchangeRate,SchangePrice,"
                    "Sturnover,SturnoverPrice,Samplitude,Shighest,Slowest,Stoday,Syesterday)"
                    "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (item["id"], item["shareNumber"], item["shareName"], item["newestPrice"],
                     item["changeRate"], item["changePrice"], item["turnover"], item["turnoverPrice"],
                     item["amplitude"], item["highest"], item["lowest"], item["today"], item["yesterday"]))
        except Exception as err:
            print(err)
        return item
运行截图
心得体会
算是第一次接触selenium,比较麻烦的还得用firefox,组装的过程比较麻烦(所以数据表的创建就很笼统的全部是字符串类型),内容和上次实验一样,但是寻找数据方面上花的时间少了许多,selenium+xpath寻找到对应数据的速度比上次快,而且处理上面也省了许多事(firefox自带查找 太香了,孩子很开心 下次还用)
作业三
使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取外汇网站数据。
items.py
import scrapy
class MoneyItem(scrapy.Item):
    """Item holding one foreign-exchange quote row."""
    id = scrapy.Field()        # sequential row number
    currency = scrapy.Field()  # currency name
    tsp = scrapy.Field()       # telegraphic selling price
    csp = scrapy.Field()       # cash selling price
    tbp = scrapy.Field()       # telegraphic buying price
    cbp = scrapy.Field()       # cash buying price
    time = scrapy.Field()      # quote timestamp
MySpider.py
import scrapy
from ..items import MoneyItem
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
class MySpider(scrapy.Spider):
    """Scrape foreign-exchange quotes from the bank's rate table and yield
    MoneyItem objects, skipping the table's header row."""
    name = "mySpider"
    # NOTE(review): the URL was blank in the original; the 'fontbold'/'numberright'
    # cell classes below match the CMB FX quote page — confirm before running.
    source_url = 'http://fx.cmbchina.com/hq/'

    def start_requests(self):
        url = MySpider.source_url
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            # The page may be served as utf-8 or gbk; let UnicodeDammit decide.
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            count = 1   # row counter: row 1 is the header and carries no data
            rows = selector.xpath("//table[@cellspacing='1']/tr")
            idn = 1     # sequential id assigned to each data row
            for li in rows:
                if count != 1:
                    id = idn
                    currency = li.xpath("./td[@class='fontbold']/text()").extract_first().strip()
                    tsp = li.xpath("./td[@class='numberright'][position()=1]/text()").extract_first().strip()
                    csp = li.xpath("./td[@class='numberright'][position()=2]/text()").extract_first().strip()
                    tbp = li.xpath("./td[@class='numberright'][position()=3]/text()").extract_first().strip()
                    cbp = li.xpath("./td[@class='numberright'][position()=4]/text()").extract_first().strip()
                    time = li.xpath("./td[@align='center'][position()=3]/text()").extract_first().strip()
                    item = MoneyItem()
                    item["id"] = id if id else ""
                    item["currency"] = currency if currency else ""
                    item["tsp"] = tsp if tsp else ""
                    item["csp"] = csp if csp else ""
                    item["tbp"] = tbp if tbp else ""
                    item["cbp"] = cbp if cbp else ""
                    item["time"] = time if time else ""
                    yield item
                    idn = idn + 1
                count = count + 1
        except Exception as err:
            print(err)
pipelines.py
from itemadapter import ItemAdapter
import pymysql
class MoneyPipeline:
    """Persist scraped MoneyItem records into MySQL, one row per item."""

    def open_spider(self, spider):
        print("opened")
        try:
            # TODO(review): host was blank in the original — confirm DB location.
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="axx123123", db="mydb",
                                       charset="utf8")
            self.cursor = self.con.cursor()
            # Clear old rows so each crawl stores a fresh snapshot.
            self.cursor.execute("delete from money")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")

    def process_item(self, item, spider):
        try:
            if self.opened:
                # Bug fix: the Mcbp column must receive item["cbp"]; the original
                # passed item["tsp"] twice.
                self.cursor.execute("insert into money(Mid,Mcurrency,Mtsp,Mcsp,Mtbp,Mcbp,Mtime) "
                    "values (%s,%s,%s,%s,%s,%s,%s)",
                    (item["id"], item["currency"], item["tsp"], item["csp"],
                     item["tbp"], item["cbp"], item["time"]))
        except Exception as err:
            print(err)
        return item
运行截图
心得体会
在前两个实验的基础下,加上外汇网站本身的html结构不是很复杂,多用tag[condition]的方法很快就能找到对应数据的xpath表达式