# python3学习爬虫 正则以及url — Python 3 scraping practice: regular expressions and URLs

#coding=utf8
__author__ = 'Administrator'

import os
import re
import urllib.request
import pymysql

class Spider:
    """Simple page scraper: fetches a URL, extracts regex matches from the
    HTML, and saves results (matched text / images) under a local directory.

    Attributes:
        url:    page URL to fetch.
        retext: regex pattern applied to the decoded HTML.
        path:   output directory name (created on demand).
    """

    def __init__(self, url, retext, path):
        self.url = url
        self.path = path
        self.retext = retext

    def mkdir(self, path):
        """Create *path* (including parents) if it does not exist; return *path*."""
        if not os.path.exists(path):
            os.makedirs(path)
        return path

    def getData(self):
        """Fetch ``self.url`` and return every match of ``self.retext``.

        The response body is decoded as UTF-8 with errors ignored, since
        scraped pages may contain stray bytes.
        """
        request = urllib.request.Request(self.url)
        html = urllib.request.urlopen(request).read()
        html = html.decode('utf-8', 'ignore')
        return re.findall(self.retext, html)

    def saveImg(self, imgurl, imgname):
        """Download *imgurl* and save it as ``./<path>/<imgname>.jpg``.

        Best-effort: on any download error the exception is printed and the
        image is skipped (matches the original's behavior).
        """
        try:
            response = urllib.request.urlopen(imgurl)
        except Exception as e:
            print(e)
        else:
            # Context manager guarantees the file handle is closed even if
            # the write fails (original leaked the handle on error).
            with open("./%s/%s.jpg" % (self.path, imgname), 'wb') as f:
                f.write(response.read())

    def saveMysql(self, title, url, catogary, content):
        """Insert one row into the ``pic`` table.

        Bug fixes vs. the original:
        - the VALUES clause had 3 placeholders for 4 values (TypeError);
        - the query was built with %-formatting (SQL injection) — now
          parameterized via cursor.execute's args;
        - ``con.commit()`` was never called, so the insert was rolled back.
        Errors are caught and printed, as before.
        """
        try:
            con = pymysql.connect(host='qdm***w.com', user='q****46',
                                  passwd='*******', db='qd*****db',
                                  port=3306, charset='utf8')
            try:
                cur = con.cursor()
                cur.execute(
                    "insert into pic(title,url,catogary,content) values (%s,%s,%s,%s)",
                    (title, url, catogary, content))
                con.commit()  # required: pymysql does not autocommit by default
                cur.close()
            finally:
                con.close()  # release the connection even if the insert fails
        except Exception as e:
            print("发生异常:%s" % e)

    def getContent(self):
        """Driver: ensure the output dir exists, fetch matches, and append
        each match to ``./<path>/url.txt`` (also echoed to stdout)."""
        path = self.mkdir(self.path)
        data = self.getData()
        x = 0
        with open('./%s/url.txt' % path, 'w+') as fp:
            for d in data:
                print(d)
                fp.write(d)
                if len(d) < 80:
                    # Image download is disabled in the original
                    # (self.saveImg(d, x) was commented out); x still counts
                    # the matches that would have been saved.
                    x += 1


# Sample targets kept from the original script.
url = "http://www.zhihu.com/question/29649162"
url2 = 'http://image.baidu.com/activity/starfans/4093640704%201415350495?&albumtype=1'
# Matches direct image links (jpg/png).
retext = r'http://.*?\.jpg|http://.*?\.png'

# Captures question/answer titles from Zhihu's h2 heading markup.
retext2 = r'<h2 class="zm-item-title.*?>(.*?)</h2>'

if __name__ == "__main__":
    # Guarded so importing this module no longer triggers network I/O
    # (the original ran the spider at import time).
    spider = Spider(url, retext2, "赵丽颖")
    spider.getContent()

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值