抓取糗事百科的数据并存入数据库。
看下面的代码应该就知道使用方法了。
#coding=utf-8
import requests;
import sqlite3;
from bs4 import BeautifulSoup as bs;
class QBSpider:
    """Scrape joke posts from qiushibaike.com and store them in a SQLite table."""

    def __init__(self, connectedDb, tableName):
        """
        connectedDb: an already-open sqlite3 connection.
        tableName:   name of the table to create (if missing) and insert into.
                     NOTE(review): interpolated into SQL unescaped — must be a
                     trusted identifier, never user-supplied input.
        """
        self.setDb(connectedDb)
        self.setTableName(tableName)
        self._createTable(tableName)

    def setDb(self, db):
        self._db = db

    def getDb(self):
        return self._db

    def setTableName(self, tn):
        self._tn = tn

    def getTableName(self):
        return self._tn

    def _processElement(self, elements):
        """Extract one post's fields from its child <div> elements.

        Returns a (post_id, content, image, date, url) tuple matching the
        column order expected by insert().
        """
        post_id = 0  # renamed from `id` — avoid shadowing the builtin
        content = ""
        date = ""
        image = None
        url = ""
        for element in elements:
            eClass = " ".join(element["class"])
            if eClass == "content":
                content = element.text.strip("\n")
                date = element["title"]
            elif eClass == "stats clearfix":
                url = element.findChildren(
                    name="span", attrs={"class": "stats-comments"})[0].a["href"]
            elif eClass == "stats-buttons bar clearfix":
                # The element id carries a fixed 14-char prefix before the numeric post id.
                post_id = element["id"][14:]
            elif eClass == "thumb":
                # Most posts carry at most one photo, so only the first image URL is kept.
                image = element.a.img["src"]
        return post_id, content, image, date, url

    def fetch(self, url):
        """Download one listing page and insert every article found on it."""
        # Explicit parser: bare bs(text) picks whatever parser happens to be
        # installed, which warns and can change parse results between machines.
        bsoup = bs(requests.get(url, headers={
            "Host": "www.qiushibaike.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.103 Safari/537.36"
        }).text, "html.parser")
        root = bsoup.find(name="div", attrs={"id": "content-left"})
        childElements = root.findChildren(
            name="div", attrs={"class": "article block untagged mb15"})
        for childElement in childElements:
            self.insert(self._processElement(childElement.findChildren(name="div")))

    def insert(self, args):
        """Insert one (id, content, image, date, url) tuple; roll back on failure."""
        try:
            self.getDb().cursor().execute(
                "insert into {0}(id, content, image, date, url) values(?, ?, ?, ?, ?)".format(
                    self.getTableName()),
                args)
            self.getDb().commit()
        except sqlite3.Error as e:  # was a bare except that hid the actual error
            print("insert error")
            print(e)
            self.getDb().rollback()

    def _createTable(self, tableName):
        """Create the storage table if it does not already exist.

        IF NOT EXISTS replaces the original bare-except trick of letting the
        "table already exists" error be silently swallowed.
        """
        try:
            self.getDb().cursor().execute(
                "create table if not exists {0} (id INTEGER NOT NULL PRIMARY KEY, "
                "content varchar(6144) NULL, image varchar(2048) NULL, "
                "date datetime NULL, url varchar(1024) NULL)".format(tableName))
            self.getDb().commit()
        except sqlite3.Error:
            self.getDb().rollback()
if __name__ == "__main__":  # smoke test: scrape one listing page into ./test.db
    db = sqlite3.connect("./test.db")
    try:
        spider = QBSpider(db, "qbData")
        spider.fetch("http://www.qiushibaike.com/hot")
    finally:
        # The original never closed the connection; ensure it is released
        # even if the fetch raises (e.g. network failure).
        db.close()
抓取的数据: