main script:
# -*- coding:utf-8 -*-
__author__ = 'zhaochaoye'
import utlis
import urllib
import MySQLdb
import jieba
if __name__ == "__main__":
# generate start url list
start_urls = []
for i in range(100, 3750):
url = "http://zxyxpt.suda.edu.cn/Detail.aspx?id="+str(i)
start_urls.append(url)
newsSpider = utlis.NewsSpider(start_urls)
newsSpider.parse()
host, user, pwd, db = "localhost", "root", "root", "databaseA"
conn = MySQLdb.connect(host, user, pwd, db, charset='utf8')
# 获取cursor对象来进行操作
cursor = conn.cursor()
# 设置数据库编码格式
cursor.execute("SET NAMES utf8")
cursor.execute("SET CHARACTER_SET_CLIENT=utf8")
cursor.execute("SET CHARACTER_SET_RESULTS=utf8")
for record in newsSpider.records:
print record
#编写sql语句
sql = "INSERT INTO yixiao VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (record.ID, record.category, record.state,record.time,record.TitleLB,record.ContentLB,record.ReplyLB,record. Retime)
#执行sql语句
try:
cursor.execute(sql)
conn.commit()
except:
pass
# 关闭数据库
cursor.close()
conn.close()
utlis module (note: the module name is deliberately spelled "utlis" to match the import above):
# -*- coding:utf-8 -*-
__author__ = 'zhaochaoye'
from bs4 import BeautifulSoup
import urllib
import re
import time
import random
class Record(object):
    """One crawled proposal page:
    [ID / category / state / submit time / title / content / reply / reply time].
    """

    def __init__(self, ID, category, state, time, TitleLB, ContentLB,
                 ReplyLB, Retime):
        self.ID = ID                # numeric page id
        self.category = category    # proposal category
        self.state = state          # processing state
        self.time = time            # submit time
        self.TitleLB = TitleLB      # title text
        self.ContentLB = ContentLB  # body text
        self.ReplyLB = ReplyLB      # reply / handling opinion
        self.Retime = Retime        # reply time

    def __str__(self):
        # GBK-encode the text fields for console output — presumably for a
        # GBK (Chinese Windows) console; TODO confirm.
        fields = (self.category, self.state, self.time, self.TitleLB,
                  self.ContentLB, self.ReplyLB, self.Retime)
        return str(self.ID) + "," + ",".join(str(f.encode("gbk")) for f in fields)

    # The original duplicated the entire __str__ body verbatim in
    # __repr__; delegate instead so the two can never drift apart.
    __repr__ = __str__
class UrlParser(object):
    """Fetch one detail page via urllib and extract the fields
    [ID / category / state / submit time / title / content / reply / reply time].
    """

    def __init__(self, url):
        self.url = url
        # Single fetch. The original opened the URL three separate times
        # (once each for info/status/content), tripling the server load and
        # allowing status and body to come from different responses.
        response = urllib.urlopen(url)
        self.head_info = response.info()   # response headers
        self.status = response.getcode()   # HTTP status code
        self.content = response.read()     # raw page body

    def _soup(self):
        # Parse the page once and cache the tree; the original rebuilt a
        # BeautifulSoup object inside every single extract_* call.
        if not hasattr(self, "_parsed"):
            self._parsed = BeautifulSoup(self.content)
        return self._parsed

    def _span_text(self, span_id):
        # Every field lives in a <span id="..."> element; return its text.
        return self._soup().find(id=span_id).get_text()

    def url_extractor(self):
        """Extract all fields from the fetched page and return a Record."""
        return Record(
            self.extract_ID(),
            self.extract_category(),
            self.extract_state(),
            self.extract_time(),
            self.extract_TitleLB(),
            self.extract_ContentLB(),
            self.extract_ReplyLB(),
            self.extract_Retime(),
        )

    def extract_ID(self):
        return self._span_text("idlb")

    def extract_category(self):
        return self._span_text("list_idLB")

    def extract_state(self):
        return self._span_text("stateLB")

    def extract_time(self):
        return self._span_text("TimeLB")

    def extract_TitleLB(self):
        return self._span_text("TitleLB")

    def extract_ContentLB(self):
        return self._span_text("ContentLB")

    def extract_ReplyLB(self):
        return self._span_text("ReplyLB")

    def extract_Retime(self):
        return self._span_text("RetimeLB")
class NewsSpider(object):
# 初始化需要采集的游记列表页面URL
def __init__(self, crawl_urls):
self.crawl_urls = crawl_urls
self.records = []
# 解析页面获取游记页面URL
def parse(self):
print "crawling travel urls"
cout = 0
for url in self.crawl_urls:
url_parser = UrlParser(url)
self.records.append(url_parser.url_extractor())
time.sleep(random.random()/10)
cout += 1
print cout, url