#!/usr/bin/python
#coding:utf-8
import json
import urllib2
import re
from bs4 import BeautifulSoup
import MySQLdb
import sys
# Python 2 hack: re-expose sys.setdefaultencoding (removed from the module
# namespace at startup) and force the interpreter's default codec to UTF-8
# so implicit str<->unicode conversions below don't raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')
def ConnectMysql(book_name,imglist,writer,info,url):
print book_name,imglist,writer,info,url
try:
myconnet=MySQLdb.connect("localhost","root","","db_books",charset="utf8")
except MySQLdb.OperationalError,message:
print "数据库连接失败"
mycursor=myconnet.cursor()
sql="insert into book_info values('%s','%s','%s','%s','%s')"%(book_name,imglist,writer,info,url)
mycursor.execute(sql)
myconnet.commit()
mycursor.close()
myconnet.close()
def OpenPage(page):
    """Fetch *page* over HTTP and return its body transcoded GBK -> UTF-8.

    Undecodable GBK bytes are silently dropped (errors="ignore").
    """
    # Empty headers dict kept for interface parity with the original request.
    Myheader = {}
    request = urllib2.Request(page, headers=Myheader)
    f = urllib2.urlopen(request)
    try:
        data = f.read()
    finally:
        # Bug fix: the response object was never closed, leaking the socket
        # across the many pages this script fetches.
        f.close()
    return data.decode("GBK", errors="ignore").encode("utf-8")
#解析指定页面内容
def JiexiPage(data):
    """Parse a forum listing page and return its thread URLs, deduplicated.

    Matches every anchor whose href contains "thread-" and prefixes the
    site root; order of the returned list is unspecified (set-based dedup).
    """
    soup = BeautifulSoup(data, "html.parser")
    anchors = soup.find_all(href=re.compile("thread-"))
    unique_urls = {"http://www.51dupdf.com/" + a['href'] for a in anchors}
    return list(unique_urls)
#深剖析每一个页面内容
def EachPageJx(url):
    """Scrape one book detail page and persist its record via ConnectMysql.

    Extracts the cover image, title, and author from the thread page at
    *url*; the stored description is a fixed "open the link below" message.
    """
    pagedata = OpenPage(url)
    soup = BeautifulSoup(pagedata, "html.parser")
    # All .jpg image URLs embedded in the raw HTML.
    imglist = re.findall(r'<img src="([^"]+\.jpg)', pagedata)
    # Bug fix: the original indexed imglist[1] unconditionally and raised
    # IndexError on pages with fewer than two images (index 1 presumably
    # skips a site logo — TODO confirm against live pages).
    if len(imglist) > 1:
        imgaddr = "http://www.51dupdf.com/" + imglist[1]
    else:
        imgaddr = ""
    book_name = soup.find_all('a', class_=re.compile("vt_title"))[0].get_text()
    # The author sits in the first table row; token 5 of the space-split
    # text, truncated at the first newline. Fragile positional parsing —
    # kept as-is to preserve behavior.
    row_text = soup.find('tbody').find_all('tr')[0].get_text()
    writer = row_text.split(" ")[5].split("\n")[0]
    # Dead code removed: the original extracted and encoded the td.t_f
    # description, then immediately overwrote it with this fixed message.
    info = "详细信息请打开下面的链接"
    ConnectMysql(book_name, imgaddr, writer, info, url)
import math
if __name__=="__main__":
page_url_list=[]
for item in range(1,15):
page_url_list.append("http://www.51dupdf.com/forum.php?mod=forumdisplay&fid=45&typeid=21&sortid=2&typeid=21&sortid=2&filter=typeid&page="+"%d"%item)
for page_list in page_url_list:
print page_list
page_data=OpenPage(page_list)
url_list=JiexiPage(page_data)
for item in url_list:
EachPageJx(item)
# Python 爬虫小项目:爬一个图书网站 (Python scraper mini-project: crawl a book website)
# 最新推荐文章于 2024-08-04 14:49:45 发布 (article-page residue, commented out so the file parses)