Tieba Crawler

I wrote a simple Tieba (Baidu forum) crawler; the code is below.

Python version: 2.7

#coding:utf-8
__author__ = 'zhengjinwei'

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from lib import spider
from lib import fileWriter
import time
import os


class TieBaSpider:
    def __init__(self,userAgent):
        self.spider = spider.Spider(userAgent)
        self.users = {}
        self.curFilePath = self.cur_file_dir()

    def cur_file_dir(self):
        # return the directory containing this script
        path = sys.path[0]
        # if run as a plain script, sys.path[0] is already a directory;
        # if run as a py2exe-compiled executable, it is the file's path,
        # so return its parent directory instead
        if os.path.isdir(path):
            return path
        elif os.path.isfile(path):
            return os.path.dirname(path)

    # get the sex of the user whose profile page is at userUrl
    def getUserSex(self,userUrl):
        pattern = '''<span class="userinfo_sex userinfo_sex_(.*?)"></span><span>(.*?)</span>'''
        contents = self.spider.getContents(userUrl,pattern,2)

        # print contents
        if len(contents):
            return contents[0][0]
        else:
            return -1

    # get the list of forums the user follows (关注的吧)
    def getUserGuanZhu(self,url):
        pattern = '''<a data-fid=".*?" target="_blank" locate=".*?" href=".*?".*?class="u-f-item unsign"><span>(.*?)</span><span class="forum_level (.*?)"></span></a>'''
        contents = self.spider.getContents(url,pattern,2)
        if len(contents):
            return contents
        else:
            return -1


    # crawl one page of the forum's member list and store each member's info
    def enterUsersPage(self,url):
        pattern = '''<span class="member.*?"><a href="(.*?)" class="avatar"><img src="(.*?)" alt="(.*?)"></a><div class="name_wrap"><a href="(.*?)" class="user_name" title="(.*?)">(.*?)</a>'''

        contents = self.spider.getContents(url,pattern,5)
        # print contents
        if len(contents):
            for i in range(0,len(contents)):
                userUrl =  "http://tieba.baidu.com"+contents[i][0]

                userName = contents[i][4]
                sex = self.getUserSex(userUrl)
                guanZhu = self.getUserGuanZhu(userUrl)

                if sex != -1 and guanZhu != -1:
                    userGZ = []
                    for g in range(0,len(guanZhu)):
                        userGZ.append(guanZhu[g][0])
                    self.storeUserInfo(userName,sex,userGZ)
                else:
                    pass
            return 1
        else:
            return -1


    # append one user record to zjw.txt, skipping users already seen
    def storeUserInfo(self,userName,sex,gz):
        if userName not in self.users:
            strGz = ''
            for i in range(0,len(gz)):
                strGz += '['+str(gz[i])+']'
            self.users[userName] = sex
            str1 = userName+"\t"+sex+'\t'+strGz+"\r\n"

            f = fileWriter.FileWriter(self.curFilePath,'zjw.txt','a')
            print str1
            f.write(str1)
            time.sleep(1)
            f.close()
        else:
            print "duplicate user:",userName,sex

    def go(self,pageUrl):
        pattern = '''<a.*?id="member_name_link".*?href="(.*?)".*?target="_blank">(.*?)</a>.*?</p>.*?<p class="forum_info_desc">'''

        contents = self.spider.getContents(pageUrl,pattern,1)

        if len(contents):
            usersUrl = "http://tieba.baidu.com"+contents[0][0]
            # enter all users page
            self.enterUsersPage(usersUrl)

            page = 2
            while True:
                pageUserUrl = usersUrl +"&pn="+str(page)
                print "process next page:",pageUserUrl

                ret = self.enterUsersPage(pageUserUrl)
                if ret == 1:
                    page += 1
                    print page
                else:
                    print "all pages processed,",page,ret
                    break
        else:
            pass



demo = TieBaSpider('Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko')
siteUrl = u"http://tieba.baidu.com/f?kw=%E6%A2%A6%E6%83%B3%E5%B0%8F%E9%95%87&ie=utf-8&pn="


demo.go(siteUrl)
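
For reference, each record that storeUserInfo appends to zjw.txt has the form "username<TAB>sex<TAB>[forum1][forum2]...". Below is a minimal sketch of reading those records back; the field layout follows storeUserInfo above, but the reader itself is my own illustrative addition, not part of the crawler:

#coding:utf-8
# illustrative reader for the zjw.txt records produced by storeUserInfo above;
# the parsing logic here is an assumption based on that field layout
import codecs
import re

def load_users(path='zjw.txt'):
    users = {}
    with codecs.open(path, 'r', 'utf-8') as f:
        for line in f:
            parts = line.rstrip('\r\n').split('\t')
            if len(parts) != 3:
                continue
            name, sex, forums = parts
            # the forum field looks like "[forum1][forum2]..."; pull out the names
            users[name] = (sex, re.findall(r'\[(.*?)\]', forums))
    return users

if __name__ == '__main__':
    for name, (sex, forums) in load_users().items():
        print name.encode('utf-8'), sex, len(forums), 'forums followed'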


The lib modules used above are as follows:


spider.py

#coding:utf-8
__author__ = 'zhengjinwei'

import urllib2
import re
import requests

class Spider:
    def __init__(self,userAgent):

        self.user_agent = userAgent
        self.headers = {'User-Agent' : self.user_agent}

    def getPage(self,url,decode=False):
        request = urllib2.Request(url,headers=self.headers)
        response = urllib2.urlopen(request)
        if decode:
            return response.read().decode('utf-8')
        else:
            return response.read()

    def postHttp(self,url,postData):
        r = requests.post(url, data=postData)
        return r.text

    def getRawContent(self,url):
        pageContent = self.getPage(url)
        return pageContent

    # fetch url, run the regex strPattern over it (with re.S), and return a list
    # of rows, each holding the first `count` capture groups of one match
    def getContents(self,url,strPattern,count):
        page = self.getPage(url)

        pattern = re.compile(strPattern,re.S)

        items = re.findall(pattern,page)

        contents=[]

        for item in items:
            temArr = []
            for i in range(0,count):
                temArr.append(item[i])
            contents.append(temArr)

        return contents
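
For reference, a small standalone usage sketch of the Spider class above. The regex passed to getContents must contain at least count capture groups, and each returned row holds the first count groups of one match. The URL and pattern below are illustrative placeholders of my own, not taken from the crawler:

# usage sketch for Spider.getContents; the URL and pattern are assumed examples
from lib import spider

s = spider.Spider('Mozilla/5.0')
# a pattern with two capture groups, so we ask for count=2
pattern = r'<a href="(.*?)" class="user_name" title="(.*?)">'
rows = s.getContents('http://tieba.baidu.com/f?kw=python&ie=utf-8', pattern, 2)
for href, title in rows:
    print href, title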



fileWriter.py

#coding=utf-8
__author__ = 'zhengjinwei'
import os
import codecs


class FileWriter:
    def __init__(self,fileDir,fileName,mode):
        self.mkDir(fileDir)
        # keep two handles on the same file: a codecs handle for unicode
        # writes and a plain handle for byte-string writes
        self.codecs = codecs.open(fileDir+u"/"+fileName,mode,'utf-8')
        self.f = open(fileDir+u"/"+fileName,mode)

    def mkDir(self,path):
        isExists = os.path.exists(path)

        if not isExists:
            os.makedirs(path)

    def writeCodecs(self,contents):
        return self.codecs.write(contents)

    def write(self,contents):
        return self.f.write(contents)

    def close(self):
        self.f.close()
        self.codecs.close()
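
And a quick usage sketch of FileWriter, mirroring how the crawler uses it (the directory, file name, and record below are illustrative examples of my own):

# usage sketch for FileWriter; './output', 'demo.txt' and the record are assumed examples
from lib import fileWriter

w = fileWriter.FileWriter('./output', 'demo.txt', 'a')   # 'a' appends; the directory is created if missing
w.write('someuser\tmale\t[forum_a][forum_b]\r\n')
w.close()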

------------ The End, heh ----------------

