python爬取本站电子书信息并入库的实现代码

入门级爬虫:只抓取书籍名称,信息及下载地址并存储到数据库

数据库工具类:DBUtil.py

import pymysql

class DBUtils(object):
    """Thin helper around a pymysql connection/cursor pair."""

    def connDB(self):
        """Open a database connection and return the pair (conn, cur)."""
        # NOTE(review): credentials are hard-coded; move them to config/env
        # variables before real use.
        conn = pymysql.connect(host='192.168.251.114', port=3306, user='root',
                               passwd='b6f3g2', db='yangsj', charset='utf8')
        cur = conn.cursor()
        return (conn, cur)

    def exeUpdate(self, conn, cur, sql):
        """Execute an INSERT/UPDATE statement, commit, and return the
        affected-row count."""
        sta = cur.execute(sql)
        conn.commit()
        return sta

    def exeDelete(self, conn, cur, IDs):
        """Delete rows whose Id appears in the space-separated string *IDs*.

        Returns the total number of affected rows. (Demo helper, unused.)
        """
        sta = 0
        for eachID in IDs.split(' '):
            # Parameterized query instead of "%d" string formatting — lets the
            # driver handle quoting and avoids SQL injection.
            sta += cur.execute("delete from students where Id=%s", (int(eachID),))
        conn.commit()
        return sta

    def exeQuery(self, cur, sql):
        """Run a SELECT; return (row_count, cursor) so the caller can fetch."""
        effect_row = cur.execute(sql)
        return (effect_row, cur)

    def connClose(self, conn, cur):
        """Close cursor then connection, releasing resources."""
        cur.close()
        conn.close()

if __name__ == '__main__':
    # Smoke test: open a connection and release it again instead of
    # leaking it (the original never closed conn/cur).
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
    dbUtil.connClose(conn, cur)

书籍操作文件 bookOpe.py

from DBUtil import DBUtils

from bookInfo import Book

from bookInfo import DownLoadInfo

import logging

logging.basicConfig(

level=logging.INFO

)

class BookOperator(object):
    """Persists a Book plus its download links via DBUtils."""

    def __addBook(self, book):
        """Insert the book row and return its auto-generated primary key.

        Uses a parameterized query: the original %-formatted SQL broke on
        quotes and was injectable (the scraper even stripped "'" from the
        scraped text to work around that).
        """
        logging.info("add book:%s" % book.bookName)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        cur.execute(
            "insert into book (bookName,bookUrl,bookInfo) values (%s,%s,%s);",
            (book.bookName, book.downLoadUrl, book.mainInfo))
        conn.commit()
        bookId = cur.lastrowid  # id of the row we just inserted (race-free)
        dbUtil.connClose(conn, cur)
        return bookId

    def __selectLastBookId(self):
        """Return the largest book id.

        Kept for compatibility, but racy under concurrent writers — prefer
        the id returned by __addBook.
        """
        logging.info("selectLastBookId ")
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        selectLastBookSql = "select id from book order by id desc limit 1"
        effect_row, cur = dbUtil.exeQuery(cur, selectLastBookSql)
        bookId = cur.fetchone()[0]
        dbUtil.connClose(conn, cur)
        return bookId

    def __addBookDownLoadInfos(self, downLoadInfos, bookId):
        """Insert one book_down_url row per download link on one connection."""
        logging.info("add bookId:%s" % bookId)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        for downLoadinfo in downLoadInfos:
            cur.execute(
                "insert into book_down_url (bookId,downName,downUrl) values (%s,%s,%s);",
                (bookId, downLoadinfo.downName, downLoadinfo.downUrl))
        conn.commit()  # single commit for the whole batch
        dbUtil.connClose(conn, cur)

    def addBookInfo(self, book):
        """Public entry point: store the book, then its download links."""
        logging.info("add bookInfo:%s" % book.bookName)
        # Use the id of the row we inserted instead of re-querying the max id,
        # which could pick up a concurrent writer's row.
        bookId = self.__addBook(book)
        self.__addBookDownLoadInfos(book.downLoadInfos, bookId)

if __name__ == '__main__':
    # Quick manual check: build a dummy book and store it.
    demo_book = Book("aaa", "yang", "cccc")
    demo_book.addDownLoadUrl(DownLoadInfo("aaa.html", "书籍"))
    BookOperator().addBookInfo(demo_book)

书籍信息文件 bookInfo.py

import sys

# NOTE(review): `sys` has no `encoding` attribute; this assignment merely
# attaches a custom attribute to the module and has no effect on how text is
# encoded or decoded. It could be removed.
sys.encoding = "utf8"

class Book(object):
    """A scraped e-book: summary text, page URL, title and download links."""

    def __init__(self, mainInfo, downLoadUrl, bookName):
        self.mainInfo = mainInfo        # descriptive text block
        self.downLoadUrl = downLoadUrl  # detail/download page URL
        self.bookName = bookName        # book title
        self.downLoadInfos = []         # collected DownLoadInfo entries

    def addDownLoadUrl(self, downloadInfo):
        """Attach one more download entry to this book."""
        self.downLoadInfos.append(downloadInfo)

    def print_book_info(self):
        """Dump the title to stdout."""
        print("bookName :%s" % self.bookName)

class DownLoadInfo(object):
    """One download link: its URL plus a human-readable label."""

    def __init__(self, downUrl, downName):
        self.downUrl = downUrl    # link target
        self.downName = downName  # link text / mirror name

    def print_down_info(self):
        """Dump this link to stdout."""
        print("downLoad %s - %s" % (self.downUrl, self.downName))

51job界面解析文件 FiveOneJobFetch.py

import requests

from bs4 import BeautifulSoup

import sys

from bookInfo import Book

from bookInfo import DownLoadInfo

import logging

# NOTE(review): `sys.encoding` is not a real attribute of the sys module;
# this line has no effect on text decoding and could be removed.
sys.encoding = "utf8"

class PageFetch(object):
    """Scrapes book list pages, download pages, and book details from jb51.net."""

    # NOTE(review): the original value was scheme-less ("//www.jb51.net/");
    # requests raises MissingSchema for such URLs, so an explicit scheme is
    # required for any fetch to succeed.
    host = "https://www.jb51.net/"  # site root
    category = "books/"             # book-category path segment

    def __init__(self, pageUrl):
        """pageUrl: relative list page such as "list152_1.html"."""
        self.pageUrl = pageUrl
        self.url = PageFetch.host + PageFetch.category + pageUrl  # full list URL

    def __getPageContent(self):
        """Fetch this instance's list page ('' on failure)."""
        # Delegates to the shared static helper instead of duplicating it.
        return PageFetch.getPageContent(self.url)

    @staticmethod
    def getPageContent(url):
        """GET *url* and return its body decoded as gb2312, or '' on non-200."""
        req = requests.get(url)
        if req.status_code == 200:
            req.encoding = "gb2312"  # the site serves gb2312-encoded HTML
            return req.text
        else:
            return ""

    def __getMaxPageNumAndUrl(self):
        """Walk the pager until the last page is reached.

        Pager URLs look like "list45_2.html" (2 is the page number).
        Returns (maxPageNum, maxLink).
        """
        fetchUrl = self.pageUrl
        maxPageNum = 0
        maxLink = ""
        while maxLink == "":
            url = PageFetch.host + PageFetch.category + fetchUrl
            reqContent = PageFetch.getPageContent(url)
            soup = BeautifulSoup(reqContent, "html.parser")
            for ul in soup.select(".plist"):
                print("数据")
                print(ul)
                maxPageNum = ul.select("strong")[0].text  # total page count
                alink = ul.select("a")
                if alink[-1]['href'] == "#":
                    # "#" marks the final pager chunk; a neighbouring link
                    # holds the URL pattern for the pages.
                    maxLink = alink[1]['href']
                else:
                    fetchUrl = alink[-1]['href']  # follow to the next chunk
        return maxPageNum, maxLink

    def __formatPage(self, pageNum):
        """Build the relative URL of page *pageNum* + 1, e.g. "list45_2.html"."""
        lineBeginSite = self.pageUrl.index("_") + 1
        docBeginSite = self.pageUrl.index(".")
        return self.pageUrl[:lineBeginSite] + str(pageNum + 1) + self.pageUrl[docBeginSite:]

    def getBookPageList(self):
        """Return the absolute URL of every list page in this category."""
        shortPageList = []
        maxPageNum, urlPattern = self.__getMaxPageNumAndUrl()
        for i in range(int(maxPageNum)):
            shortPageList.append(self.host + self.category + self.__formatPage(i))
        return shortPageList

    @staticmethod
    def getDownloadPage(url):
        """Collect the absolute download-page URLs linked from one list page."""
        downPage = []
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        for a in soup.select(".cur-cat-list .btn-dl"):
            downPage.append(PageFetch.host + a['href'])
        return downPage

    @staticmethod
    def getBookInfo(url):
        """Parse one download page into a Book carrying its download links."""
        logging.info("获取书籍信息url:%s" % url)
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        # Single quotes are stripped, presumably to keep them out of the
        # downstream SQL string formatting.
        mainInfo = (soup.select("#soft-intro"))[0].text.replace("截图:", "").replace("'", "")
        title = (soup.select("dl dt h1"))[0].text.replace("'", "")
        book = Book(mainInfo, url, title)
        for ul in soup.select(".ul_Address"):
            for li in ul.select("li"):
                downLoadInfo = DownLoadInfo(li.select("a")[0]['href'],
                                            li.select("a")[0].text)
                book.addDownLoadUrl(downLoadInfo)
        return book

if __name__ == '__main__':
    # Crawl one category and print everything that was found.
    fetcher = PageFetch("list152_1.html")
    pages = fetcher.getBookPageList()

    downPage = []
    for page in pages:
        downPage += PageFetch.getDownloadPage(page)

    print("================汇总如下===============================")
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        print(book.bookName + ":%s" % book.downLoadUrl)
        for d in book.downLoadInfos:
            print("%s - %s" % (d.downUrl, d.downName))

执行文件 51Job.py:将以上各文件复制到同一文件夹下,执行此文件即可。

from FiveOneJobFetch import PageFetch

from bookInfo import Book

from bookInfo import DownLoadInfo

from bookOpe import BookOperator

def main(url):
    """Scrape every book under the given list page and store it in the DB.

    url: relative list-page name such as "list152_35.html".
    """
    p = PageFetch(url)
    shortPageList = p.getBookPageList()
    bookOperator = BookOperator()
    downPage = []
    for page in shortPageList:
        # extend() instead of repeated list concatenation, which copied the
        # accumulated list on every iteration (accidental O(n^2)).
        downPage.extend(PageFetch.getDownloadPage(page))
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        bookOperator.addBookInfo(book)
    print("数据抓取成功:" + url)

if __name__ == '__main__':
    # Category list pages to import, one crawl per entry.
    urls = ["list152_35.html","list300_2.html","list476_6.html","list977_2.html","list572_5.html","list509_2.html","list481_1.html","list576_1.html","list482_1.html","list483_1.html","list484_1.html"]
    for target_url in urls:
        main(target_url)

数据库表:书籍信息表和下载地址表

-- Book master table: one row per scraped e-book.
CREATE TABLE `book` (
`id` INT(11) NOT NULL AUTO_INCREMENT,       -- surrogate primary key
`bookName` VARCHAR(200) NULL DEFAULT NULL,  -- book title
`bookUrl` VARCHAR(500) NULL DEFAULT NULL,   -- detail/download page URL
`bookInfo` TEXT NULL,                       -- descriptive text scraped from the page
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
-- NOTE(review): AUTO_INCREMENT=2936 is a leftover from the dump this DDL was
-- exported from; a fresh install can drop it.
AUTO_INCREMENT=2936;

-- Download links: many rows per book, joined on bookId.
CREATE TABLE `book_down_url` (
`id` INT(11) NOT NULL AUTO_INCREMENT,        -- surrogate primary key
`bookId` INT(11) NOT NULL DEFAULT '0',       -- FK (unconstrained) to book.id
`downName` VARCHAR(200) NOT NULL DEFAULT '0',-- link label / mirror name
`downUrl` VARCHAR(2000) NOT NULL DEFAULT '0',-- link target
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
-- NOTE(review): AUTO_INCREMENT=44441 is a dump artifact; safe to drop on a
-- fresh install.
AUTO_INCREMENT=44441;

git地址:https://git.oschina.net/yangsj/BookFetch/tree/master

Python出入库管理系统软件是一种用Python语言开发的软件,旨在帮助企业或组织进行库存管理、出入库记录等工作。它具有以下特点: 1. 简单易用:Python是一种简洁、易学的编程语言,因此开发的出入库管理系统软件使用起来非常简单,用户无需具备复杂的编程技能,即可轻松上手操作。 2. 高度定制化:出入库管理系统软件可以根据用户的需求进行定制开发,以满足不同企业或组织的具体业务流程。用户可以根据自身需求设置库存规则、商品分类、流程处理等,实现个性化定制。 3. 库存管理:出入库管理系统软件可以对企业或组织的库存进行全面管理,包括库存调整、库存盘点、库存预警等功能。用户可以随时掌握库存情况,避免库存过多或不足的情况发生。 4. 出入库记录:软件可以记录每一次的出入库操作,包括出库单、入库单、退货单等。用户可以通过软件了解每一次出库和入库的具体情况,方便查询和追溯。 5. 数据分析与报表生成:出入库管理系统软件可以对数据进行分析和统计,生成相应的报表,帮助用户更好地了解出入库情况和库存变化趋势,为企业决策提供参考依据。 总体来说,Python出入库管理系统软件是一种方便、高效的工具,可以帮助企业或组织管理好库存,提高运营效率,降低人力成本,从而实现更好的管理和发展。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值