Spent the morning debugging the Python-MySQL connection; still couldn't get it working.
In the afternoon my advisor held a meeting: we need to build a transaction database, each of us has to write the system planning document for the part we are responsible for, and we should start on the later stages, mainly transaction analysis.
Then I went to talk things over with Xiao Zhou, and ha, she saved me. Finally got connected to the database. I also helped her solve a problem, so the exchange paid off for both of us.
The solution:
Uninstall the existing MySQL installation completely from the Control Panel.
Download app-serv-win32-2.5.10.exe (the AppServ bundle) and just install it directly.
Open 127.0.0.1 in the browser, do some simple configuration, and set the character set to utf8. A database you manage right in the browser. Pretty classy.
Then create a database named test (easy), and create the books table with an SQL statement.
SQL:
CREATE TABLE `books` (`id` int(11) NOT NULL AUTO_INCREMENT,
`headTitle` text NOT NULL,
`description` text DEFAULT NULL,
`url` varchar(500) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=250 DEFAULT CHARSET=utf8;
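Once the table exists, a quick standalone check confirms that Python can actually reach the database. This is just a minimal sketch, and it assumes the same root / 123 credentials that the pipeline code below uses:
# -*- coding: utf-8 -*-
# Minimal MySQLdb connection check (credentials assumed to match the pipelines below).
import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', port=3306, user='root', passwd='123',
                       db='test', charset='utf8')
cur = conn.cursor()
cur.execute("SHOW TABLES")
print cur.fetchall()            # the books table should appear here
cur.execute("SELECT COUNT(*) FROM books")
print cur.fetchone()[0]         # 0 on a freshly created table
cur.close()
conn.close()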
Then run scrapy startproject doubanmoive.
Paste the code Xiao Zhou sent over into the corresponding files.
Xiao Zhou's code is as follows:
items.py:
import scrapy
from scrapy.item import Item, Field

class DoubanmoiveItem(Item):
    name = Field(serializer=str)
    year = Field(serializer=str)
    score = Field(serializer=str)
    director = Field(serializer=str)
    classification = Field(serializer=str)
    actor = Field(serializer=str)
moive_spider.py:
# -*- coding: utf-8 -*-
import time
import re
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from doubanmoive.items import DoubanmoiveItem

class MoiveSpider(CrawlSpider):
    name = "doubanmoive"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["http://movie.douban.com/top250"]
    rules = [
        Rule(SgmlLinkExtractor(allow=(r'http://movie.douban.com/top250\?start=\d+.*'))),
        Rule(SgmlLinkExtractor(allow=(r'http://movie.douban.com/subject/\d+')), callback="parse_item"),
    ]

    def parse_item(self, response):
        sel = Selector(response)
        item = DoubanmoiveItem()
        item['name'] = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
        item['score'] = sel.xpath('//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
        item['director'] = sel.xpath('//*[@id="info"]/span[1]/span[2]/a/text()').extract()
        item['classification'] = sel.xpath('//span[@property="v:genre"]/text()').extract()
        item['actor'] = sel.xpath('//*[@id="info"]/span[3]/span[2]/a/text()').extract()
        return item
pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import log
from twisted.enterprise import adbapi
from scrapy.http import Request
import MySQLdb
import MySQLdb.cursors

class DoubanmoivePipeline(object):
    def __init__(self):
        # Twisted's asynchronous connection pool, backed by MySQLdb
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                                            db='test',
                                            user='root',
                                            passwd='123',
                                            cursorclass=MySQLdb.cursors.DictCursor,
                                            charset='utf8',
                                            use_unicode=False)

    def process_item(self, item, spider):
        # run the insert as an interaction on the pool
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        # insert only if this movie is not already in the table
        tx.execute("select * from doubanmoive where m_name = %s", (item['name'][0],))
        result = tx.fetchone()
        log.msg(result, level=log.DEBUG)
        print result
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            # join the genre and actor lists into '/'-separated strings
            classification = actor = ''
            lenClassification = len(item['classification'])
            lenActor = len(item['actor'])
            for n in xrange(lenClassification):
                classification += item['classification'][n]
                if n < lenClassification - 1:
                    classification += '/'
            for n in xrange(lenActor):
                actor += item['actor'][n]
                if n < lenActor - 1:
                    actor += '/'
            tx.execute(
                "insert into doubanmoive (m_name,m_year,m_score,m_director,m_classification,m_actor) values (%s,%s,%s,%s,%s,%s)",
                (item['name'][0], item['year'][0], item['score'][0], item['director'][0], classification, actor))
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
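Her pipeline writes into a doubanmoive table whose definition she didn't send; judging from the column names in the INSERT statement, something like the following should work (my guess at the schema, not her actual DDL):
# -*- coding: utf-8 -*-
# Guessed schema for the doubanmoive table used by DoubanmoivePipeline.
import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123', db='test', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS `doubanmoive` (
        `id` int(11) NOT NULL AUTO_INCREMENT,
        `m_name` varchar(255) NOT NULL,
        `m_year` varchar(16) DEFAULT NULL,
        `m_score` varchar(16) DEFAULT NULL,
        `m_director` varchar(255) DEFAULT NULL,
        `m_classification` varchar(255) DEFAULT NULL,
        `m_actor` varchar(500) DEFAULT NULL,
        PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()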
settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for doubanmoive project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'doubanmoive'
SPIDER_MODULES = ['doubanmoive.spiders']
NEWSPIDER_MODULE = 'doubanmoive.spiders'
ITEM_PIPELINES = {
    'doubanmoive.pipelines.DoubanmoivePipeline': 300,
}
LOG_LEVEL='DEBUG'
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
COOKIES_ENABLED = True
MySQL_SERVER = 'localhost'
MySQL_PORT = 3306
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'doubanmoive (+http://www.yourdomain.com)'
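With these settings saved, the crawler is started from inside the project directory with scrapy crawl doubanmoive (the name comes from the spider's name attribute), and the pipeline fills the doubanmoive table as it runs.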
The important part was borrowing her pipelines.py.
After my changes, mine looks like this:
items.py:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.item import Item, Field

class XinwenItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    headTitle = Field(serializer=str)
    description = Field(serializer=str)
    url = Field(serializer=str)
pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import log
from twisted.enterprise import adbapi
from scrapy.http import Request
import MySQLdb
import MySQLdb.cursors

class XinwenPipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                                            db='test',
                                            user='root',
                                            passwd='123',
                                            cursorclass=MySQLdb.cursors.DictCursor,
                                            charset='utf8',
                                            use_unicode=False)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        # skip pages where no article body was extracted
        str_des = item['description']
        if len(str_des):
            # insert only if this headline is not already in the table
            tx.execute("select * from books where headTitle = %s", (item['headTitle'][0],))
            result = tx.fetchone()
            log.msg(result, level=log.DEBUG)
            print result
            if result:
                log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
            else:
                tx.execute(
                    "insert into books(headTitle,description,url) values (%s,%s,%s)",
                    (item['headTitle'][0], item['description'][0], item['url']))
                log.msg("Item stored in db: %s" % item, level=log.DEBUG)
        else:
            pass

    def handle_error(self, e):
        log.err(e)

# sys.stdout = open('output_pipe3.txt', 'w')  # redirect print output to a file (an important line)
xinwen_spider.py:
# -*- coding: utf-8 -*-
import codecs
import json
import sys
import string
import time
import re
from lxml import etree
from sgmllib import SGMLParser
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from xinwen.items import XinwenItem
#reload(sys)
#sys.setdefaultencoding('utf8')

sys.stdout = open('output_body_ok222.txt', 'w')  # redirect print output to a file (an important line)


class GetIdList(SGMLParser):
    # extracts the <p> text inside <div id="Cnt-Main-Article-QQ"> (QQ article body)
    def reset(self):
        self.IDlist = []
        self.flag = False
        self.getdata = False
        self.verbatim = 0
        SGMLParser.reset(self)

    def start_div(self, attrs):
        if self.flag == True:
            self.verbatim += 1  # entered a nested div, depth + 1
            return
        for k, v in attrs:  # walk the div's attributes and values
            if k == 'id' and v == 'Cnt-Main-Article-QQ':  # now inside the target div
                self.flag = True
                return

    def end_div(self):  # hit a </div>
        if self.verbatim == 0:
            self.flag = False
        if self.flag == True:  # leaving a nested div, depth - 1
            self.verbatim -= 1

    def start_p(self, attrs):
        if self.flag == False:
            return
        self.getdata = True

    def end_p(self):  # hit a </p>
        if self.getdata:
            self.getdata = False

    def handle_data(self, text):  # collect the text
        if self.getdata:
            self.IDlist.append(text)

    def printID(self):
        for i in self.IDlist:
            print i


class GetIdList_163(SGMLParser):
    # same parser, but for <div id="endtext"> (163 article body)
    def reset(self):
        self.IDlist = []
        self.flag = False
        self.getdata = False
        self.verbatim = 0
        SGMLParser.reset(self)

    def start_div(self, attrs):
        if self.flag == True:
            self.verbatim += 1  # entered a nested div, depth + 1
            return
        for k, v in attrs:  # walk the div's attributes and values
            if k == 'id' and v == 'endtext':  # now inside the target div
                self.flag = True
                return

    def end_div(self):  # hit a </div>
        if self.verbatim == 0:
            self.flag = False
        if self.flag == True:  # leaving a nested div, depth - 1
            self.verbatim -= 1

    def start_p(self, attrs):
        if self.flag == False:
            return
        self.getdata = True

    def end_p(self):  # hit a </p>
        if self.getdata:
            self.getdata = False

    def handle_data(self, text):  # collect the text
        if self.getdata:
            self.IDlist.append(text)

    def printID(self):
        for i in self.IDlist:
            print i


add = 0


class Xinwen_spider(CrawlSpider):
    name = "huhu"
    allowed_domains = ["digi.tech.qq.com", "digi.163.com"]
    start_urls = ["http://digi.163.com/nb/",
                  "http://digi.tech.qq.com/mobile/",
                  "http://digi.tech.qq.com/clear_article_qq/tag_article_list.htm?tags=中兴"]
    rules = [
        # extract matching article links and parse them with the spider's parse_item method
        Rule(SgmlLinkExtractor(allow=('15/')), callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('a/2015')), callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('a/2014')), callback='parse_item')
    ]

    def parse_item(self, response):
        global add  # used to count how many articles were crawled
        print add
        add += 1
        sel = Selector(response)
        items = []
        item = XinwenItem()
        item['url'] = response.url
        #html = sel.xpath("/html/body")
        #item['content'] = html.xpath(".//p")
        #item['content'] = response.body

        sel2 = response.body  # the raw HTML (an important line)
        lister = GetIdList()
        lister.feed(sel2)
        lister.printID()
        lister_163 = GetIdList_163()
        lister_163.feed(sel2)
        lister_163.printID()

        item['headTitle'] = sel.xpath('/html/head/title/text()').extract()  # based on the page's HTML source
        item['description'] = sel.xpath('//div [@id="Cnt-Main-Article-QQ"]/p/text()').extract()

        for data in item['headTitle']:
            strTmp = ''
            for i in data:
                if i != u'\u2014':  # drop the dash character that gb2312 cannot encode
                    strTmp += i
            strTmp = strTmp.encode('gb2312')
            print strTmp

        ss = item['headTitle'][0]  # an important line
        ss = ss.encode('utf8')  # an important line
        print ss

        str_des = item['description']
        if len(str_des):
            ss_content = item['description'][0]
            ss_content = ss_content.encode('utf8')  # an important line
            print ss_content
        else:
            pass

        print item['url']
        items.append(item)
        return items
settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for xinwen project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'xinwen'
SPIDER_MODULES = ['xinwen.spiders']
NEWSPIDER_MODULE = 'xinwen.spiders'
ITEM_PIPELINES = {
    'xinwen.pipelines.XinwenPipeline': 300,
}
LOG_LEVEL='DEBUG'
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
COOKIES_ENABLED = True
MySQL_SERVER = 'localhost'
MySQL_PORT = 3306
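After that, scrapy crawl huhu (huhu is the name set in Xinwen_spider) kicks everything off, and the titles, descriptions, and URLs that pass the pipeline's checks land in the books table.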
The framework really is pleasant to use: you don't write much code, you tweak things here and there, and with a bit of luck it just runs, even though I have no idea how it works internally.
But once you hit a bug, you can spend a long time debugging, hunting through other people's code and solutions.
For example, Xiao Zhou's pipelines.py happened to fit my program perfectly. Sigh, I spent two days on the connection problem without feeling like I learned much; today I simply found the right code.
Still, reading other people's Scrapy blog posts is getting more and more interesting, because I can now more or less follow them.
For the moment, though, I have to switch over to transaction analysis.