python在近两年火了起来,这是因为人工智能越来越受到人们的欢迎,在网络应用中爬虫程序成为了很多企业需要使用的程序。而python用来写爬虫真的很简单。只需要用到几个库,很短的一些代码能够完成一个简易的爬虫程序。
比如:我们希望爬取一个家具板网站的内容,那么就可以写一个爬虫程序,代码如下:
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests
import MySQLdb
import time
import random
class spider:
    """A simple breadth-first crawler.

    Downloads a page, stores every discovered <a href> link in the MySQL
    table ``catlink`` (deduplicated), saves the page title and stripped body
    text into ``duanluo``, then keeps pulling uncrawled URLs from ``catlink``
    until the queue is empty.
    """

    def __init__(self, url='', dbhost='localhost', dbuser='root', dbpwd='root',
                 charset='utf8', curUrlHost='', curUrlPath='', row=5000):
        """Store the start URL and DB settings.

        Fixes: default was the typo 'locahost'; dbuser/dbpwd/charset/row and
        the curUrl* arguments were previously accepted but silently ignored.
        """
        self.url = url
        self.dbhost = dbhost
        self.dbuser = dbuser
        self.dbpwd = dbpwd
        self.charset = charset
        self.curUrlHost = curUrlHost  # scheme://netloc, set by findSitePath()
        self.curUrlPath = curUrlPath  # directory part of the path, set by findSitePath()
        self.row = row                # was hard-coded to 5000, ignoring the argument
        self.text = ''                # page HTML, set by catchWeb()

    def _connect(self):
        """Open a MySQL connection to the 'contents' database using the
        credentials given to __init__ (previously hard-coded in two places)."""
        return MySQLdb.connect(self.dbhost, user=self.dbuser, passwd=self.dbpwd,
                               db='contents', charset=self.charset)

    def catchWeb(self):
        """Download self.url into self.text.

        Returns True when a non-empty body was received, False on an empty
        body or a network error.  The original fetched the page twice (a POST
        through a hard-coded proxy whose response was discarded, then a GET
        that was actually used); a single GET is sufficient.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19'
        }
        try:
            webcode = requests.get(self.url, headers=headers, timeout=30)
        except requests.RequestException:
            # A dead link must not kill the whole crawl loop.
            self.text = ''
            return False
        webcode.encoding = 'utf-8'
        self.text = webcode.text
        return len(self.text) > 0

    def findSitePath(self):
        """Split self.url into site root (curUrlHost) and directory (curUrlPath).

        Heuristic preserved from the original: a path with no '.' is treated
        as a directory; otherwise everything after the last '/' is assumed to
        be a file name and stripped.
        """
        res = urlparse(self.url)
        self.curUrlHost = res.scheme + '://' + res.netloc
        path = res.path
        if len(path) <= 1:
            # bare '/' or empty path -> site root
            self.curUrlPath = ''
            return True
        if len(path) < 5:
            self.curUrlPath = path
            return True
        if path.rfind('.') == -1:
            # no extension anywhere: treat the whole path as a directory
            self.curUrlPath = path
            return True
        # strip the file name, keep the directory up to the last '/'
        self.curUrlPath = path[:path.rfind('/') + 1]
        return True

    def getaUrlContent(self):
        """Harvest links and page text from self.text into the database.

        Call catchWeb() and findSitePath() first.  Fixes over the original:
        - './' links were matched with a one-character slice (`aUrl[0:1] ==
          './'`) that could never be true; same off-by-one for '../'.
        - the '../' branch discarded the link target entirely.
        - all SQL was built by string concatenation from crawled hrefs and
          page text (SQL injection); now parameterized.
        - the connection leaked if any statement raised.
        - pages without a <body> or <title> crashed.
        """
        conn = self._connect()
        try:
            cursor = conn.cursor()
            Bs = BeautifulSoup(self.text, 'html.parser')
            for link in Bs.find_all('a', href=True):
                aUrl = link.get('href')
                if not aUrl or aUrl == '/':
                    continue
                if 'javascript:' in aUrl:
                    continue
                if aUrl.startswith('/'):
                    # site-absolute link
                    aUrl = self.curUrlHost + aUrl
                elif aUrl.startswith('./'):
                    aUrl = self.curUrlHost + aUrl[1:]
                elif aUrl.startswith('../'):
                    # go up one directory from the current path, keep the target
                    parent = self.curUrlPath.rstrip('/')
                    parent = parent[:parent.rfind('/') + 1] if '/' in parent else '/'
                    aUrl = self.curUrlHost + parent + aUrl[3:]
                cursor.execute("select * from catlink where url=%s", (aUrl,))
                if cursor.rowcount < 1:
                    cursor.execute(
                        "insert into catlink(url,catched,cattime) values(%s,0,%s)",
                        (aUrl, int(time.time())))
                    conn.commit()
            # Extract the readable page text.
            htmlTitle = Bs.find('title')
            bodyContent = Bs.find('body')
            if bodyContent is not None:
                self.tagClear(bodyContent)
                bodyText = bodyContent.text.replace('\n', '').replace('\r', '')
                title = htmlTitle.text if htmlTitle is not None else ''
                cursor.execute(
                    "insert into duanluo(title,content,used) values(%s,%s,0)",
                    (title, bodyText))
                conn.commit()
            # Mark the current URL done even when the page was empty, so the
            # crawl loop cannot re-fetch it forever.
            cursor.execute("update catlink set catched=1 where url=%s", (self.url,))
            conn.commit()
        finally:
            conn.close()

    def tagClear(self, jiedian=None, jihe=('script', 'a', 'img', 'ul')):
        """Remove every tag named in *jihe* from the BeautifulSoup node
        *jiedian* in place.

        The original referenced undefined globals ``jihe``/``jiedian`` and
        always raised NameError; both are now parameters with defaults.
        """
        if jiedian is None:
            return
        for mTag in jihe:
            for s in jiedian(mTag):
                s.extract()

    def setUrl(self):
        """Load the next uncrawled URL from catlink into self.url.

        Returns True when one was found, False when the queue is empty
        (the original crashed on ``row[1]`` and leaked the connection).
        """
        conn = self._connect()
        try:
            cursor = conn.cursor()
            cursor.execute('select * from catlink where catched=0 limit 1')
            row = cursor.fetchone()
            if row is None:
                return False
            self.url = row[1]
            return True
        finally:
            conn.close()

    def start(self):
        """Crawl the seed URL, then drain the catlink queue until empty."""
        if self.url != '':
            self.catchWeb()
            self.findSitePath()
            self.getaUrlContent()
        # Terminates once setUrl() finds no uncrawled rows; the original
        # looped forever and crashed when the queue emptied.
        while self.setUrl():
            self.catchWeb()
            self.findSitePath()
            self.getaUrlContent()
if __name__ == '__main__':
    # Guard the crawl behind a main check so importing this module as a
    # library does not immediately start fetching pages and writing to MySQL.
    url = 'http://www.woodmachine.ltd/'
    r = spider(url)
    r.start()