python在近两年火了起来,这是因为人工智能越来越受到人们的欢迎,在网络应用中爬虫程序成为了很多企业需要使用的程序。而python用来写爬虫真的很简单。只需要用到几个库,很短的一些代码能够完成一个简易的爬虫程序。
比如:我们希望爬取一个家具板网站的内容,那么就可以写一个爬虫程序,代码如下:
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests
import MySQLdb
import time
import random
class spider:
    """A simple breadth-first crawler.

    Downloads a page, stores every discovered <a href> link in the MySQL
    table ``catlink`` (deduplicated), saves the page title and stripped body
    text into ``duanluo``, then keeps pulling uncrawled URLs from ``catlink``
    until the queue is empty.
    """

    def __init__(self, url='', dbhost='localhost', dbuser='root', dbpwd='root',
                 charset='utf8', curUrlHost='', curUrlPath='', row=5000):
        """Store the start URL and DB settings.

        Fixes: default was the typo 'locahost'; dbuser/dbpwd/charset/row and
        the curUrl* arguments were previously accepted but silently ignored.
        """
        self.url = url
        self.dbhost = dbhost
        self.dbuser = dbuser
        self.dbpwd = dbpwd
        self.charset = charset
        self.curUrlHost = curUrlHost  # scheme://netloc, set by findSitePath()
        self.curUrlPath = curUrlPath  # directory part of the path, set by findSitePath()
        self.row = row                # was hard-coded to 5000, ignoring the argument
        self.text = ''                # page HTML, set by catchWeb()

    def _connect(self):
        """Open a MySQL connection to the 'contents' database using the
        credentials given to __init__ (previously hard-coded in two places)."""
        return MySQLdb.connect(self.dbhost, user=self.dbuser, passwd=self.dbpwd,
                               db='contents', charset=self.charset)

    def catchWeb(self):
        """Download self.url into self.text.

        Returns True when a non-empty body was received, False on an empty
        body or a network error.  The original fetched the page twice (a POST
        through a hard-coded proxy whose response was discarded, then a GET
        that was actually used); a single GET is sufficient.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19'
        }
        try:
            webcode = requests.get(self.url, headers=headers, timeout=30)
        except requests.RequestException:
            # A dead link must not kill the whole crawl loop.
            self.text = ''
            return False
        webcode.encoding = 'utf-8'
        self.text = webcode.text
        return len(self.text) > 0

    def findSitePath(self):
        """Split self.url into site root (curUrlHost) and directory (curUrlPath).

        Heuristic preserved from the original: a path with no '.' is treated
        as a directory; otherwise everything after the last '/' is assumed to
        be a file name and stripped.
        """
        res = urlparse(self.url)
        self.curUrlHost = res.scheme + '://' + res.netloc
        path = res.path
        if len(path) <= 1:
            # bare '/' or empty path -> site root
            self.curUrlPath = ''
            return True
        if len(path) < 5:
            self.curUrlPath = path
            return True
        if path.rfind('.') == -1:
            # no extension anywhere: treat the whole path as a directory
            self.curUrlPath = path
            return True
        # strip the file name, keep the directory up to the last '/'
        self.curUrlPath = path[:path.rfind('/') + 1]
        return True

    def getaUrlContent(self):
        """Harvest links and page text from self.text into the database.

        Call catchWeb() and findSitePath() first.  Fixes over the original:
        - './' links were matched with a one-character slice (`aUrl[0:1] ==
          './'`) that could never be true; same off-by-one for '../'.
        - the '../' branch discarded the link target entirely.
        - all SQL was built by string concatenation from crawled hrefs and
          page text (SQL injection); now parameterized.
        - the connection leaked if any statement raised.
        - pages without a <body> or <title> crashed.
        """
        conn = self._connect()
        try:
            cursor = conn.cursor()
            Bs = BeautifulSoup(self.text, 'html.parser')
            for link in Bs.find_all('a', href=True):
                aUrl = link.get('href')
                if not aUrl or aUrl == '/':
                    continue
                if 'javascript:' in aUrl:
                    continue
                if aUrl.startswith('/'):
                    # site-absolute link
                    aUrl = self.curUrlHost + aUrl
                elif aUrl.startswith('./'):
                    aUrl = self.curUrlHost + aUrl[1:]
                elif aUrl.startswith('../'):
                    # go up one directory from the current path, keep the target
                    parent = self.curUrlPath.rstrip('/')
                    parent = parent[:parent.rfind('/') + 1] if '/' in parent else '/'
                    aUrl = self.curUrlHost + parent + aUrl[3:]
                cursor.execute("select * from catlink where url=%s", (aUrl,))
                if cursor.rowcount < 1:
                    cursor.execute(
                        "insert into catlink(url,catched,cattime) values(%s,0,%s)",
                        (aUrl, int(time.time())))
                    conn.commit()
            # Extract the readable page text.
            htmlTitle = Bs.find('title')
            bodyContent = Bs.find('body')
            if bodyContent is not None:
                self.tagClear(bodyContent)
                bodyText = bodyContent.text.replace('\n', '').replace('\r', '')
                title = htmlTitle.text if htmlTitle is not None else ''
                cursor.execute(
                    "insert into duanluo(title,content,used) values(%s,%s,0)",
                    (title, bodyText))
                conn.commit()
            # Mark the current URL done even when the page was empty, so the
            # crawl loop cannot re-fetch it forever.
            cursor.execute("update catlink set catched=1 where url=%s", (self.url,))
            conn.commit()
        finally:
            conn.close()

    def tagClear(self, jiedian=None, jihe=('script', 'a', 'img', 'ul')):
        """Remove every tag named in *jihe* from the BeautifulSoup node
        *jiedian* in place.

        The original referenced undefined globals ``jihe``/``jiedian`` and
        always raised NameError; both are now parameters with defaults.
        """
        if jiedian is None:
            return
        for mTag in jihe:
            for s in jiedian(mTag):
                s.extract()

    def setUrl(self):
        """Load the next uncrawled URL from catlink into self.url.

        Returns True when one was found, False when the queue is empty
        (the original crashed on ``row[1]`` and leaked the connection).
        """
        conn = self._connect()
        try:
            cursor = conn.cursor()
            cursor.execute('select * from catlink where catched=0 limit 1')
            row = cursor.fetchone()
            if row is None:
                return False
            self.url = row[1]
            return True
        finally:
            conn.close()

    def start(self):
        """Crawl the seed URL, then drain the catlink queue until empty."""
        if self.url != '':
            self.catchWeb()
            self.findSitePath()
            self.getaUrlContent()
        # Terminates once setUrl() finds no uncrawled rows; the original
        # looped forever and crashed when the queue emptied.
        while self.setUrl():
            self.catchWeb()
            self.findSitePath()
            self.getaUrlContent()
if __name__ == '__main__':
    # Guard the crawl behind a main check so importing this module as a
    # library does not immediately start fetching pages and writing to MySQL.
    url = 'http://www.woodmachine.ltd/'
    r = spider(url)
    r.start()