Python Targeted Crawler: Campus Forum Post Information

Introduction

I wrote this small crawler mainly to collect internship postings from the campus forum. It is built mainly on the Requests library, with lxml for parsing and MySQLdb for storage.

Source Code

URLs.py

Its job is to take an initial URL that contains a page parameter and return the list of URLs from the current page number up to pageNum.

import re

def getURLs(url, attr, pageNum=1):
    # build the list of URLs from the current page number up to pageNum
    all_links = []
    try:
        # read the current page number out of the query string, e.g. "page=1" -> 1
        now_page_number = int(re.search(attr + r'=(\d+)', url).group(1))
        for i in range(now_page_number, pageNum + 1):
            # substitute each page index in turn for the original page number
            new_url = re.sub(attr + r'=\d+', attr + '=%s' % i, url)
            all_links.append(new_url)
        return all_links
    except TypeError:
        print "TypeError: attr should be a string."
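For example, with the board URL used later in the crawler, getURLs simply steps the page parameter through the requested range:

# sample usage of getURLs with the board URL from the main crawler below
links = getURLs('http://www.cc98.org/list.asp?boardid=459&page=1&action=', 'page', 3)
for link in links:
    print link
# http://www.cc98.org/list.asp?boardid=459&page=1&action=
# http://www.cc98.org/list.asp?boardid=459&page=2&action=
# http://www.cc98.org/list.asp?boardid=459&page=3&action=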
uni_2_native.py

The Chinese text in the pages fetched from the forum comes back as HTML numeric character references of the form &#XXXX;, so the content has to be converted back to native characters after it is downloaded.

import sys
import re

# Python 2: let unicode strings be handled as utf-8 without explicit encoding
reload(sys)
sys.setdefaultencoding('utf-8')

def get_native(raw):
    # replace every numeric character reference (&#XXXX;) with the character it encodes
    tostring = raw
    while True:
        obj = re.search('&#(.*?);', tostring, flags=re.S)
        if obj is None:
            break
        # group(0) is the full reference, group(1) the decimal code point
        entity, code = obj.group(0), obj.group(1)
        tostring = tostring.replace(entity, unichr(int(code)))
    return tostring
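For reference, the standard library can decode these references too; a minimal sketch, assuming Python 2 (on Python 3 the equivalent would be html.unescape):

# alternative using the standard library's HTMLParser (Python 2);
# unescape() resolves numeric references such as &#20013; as well as named entities
from HTMLParser import HTMLParser

def get_native_stdlib(raw):
    return HTMLParser().unescape(raw)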
Saving to the database: saveInfo.py

Despite the class name saveSqlite, the records are actually written to MySQL through MySQLdb.
# -*- coding: utf-8 -*-

import MySQLdb


class saveSqlite():
    def __init__(self):
        # each post is cached as a dict in this list until toMySQL() flushes it
        self.infoList = []

    def saveSingle(self, author=None, title=None, date=None, url=None, reply=0, view=0):
        # cache one post; all four text fields are required
        if author is None or title is None or date is None or url is None:
            print "No info saved!"
        else:
            singleDict = {}
            singleDict['author'] = author
            singleDict['title'] = title
            singleDict['date'] = date
            singleDict['url'] = url
            singleDict['reply'] = reply
            singleDict['view'] = view
            self.infoList.append(singleDict)

    def toMySQL(self):
        # despite the class name, the data goes to MySQL via MySQLdb
        conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306, db='db_name', charset='utf8')
        cursor = conn.cursor()

        # clear the table first so repeated runs do not pile up duplicate rows
        sql = "delete from info"
        cursor.execute(sql)
        conn.commit()

        # batch-insert every cached post in a single executemany call
        sql = "insert into info(title,author,url,date,reply,view) values (%s,%s,%s,%s,%s,%s)"
        params = []
        for each in self.infoList:
            params.append((each['title'], each['author'], each['url'], each['date'], each['reply'], each['view']))
        cursor.executemany(sql, params)

        conn.commit()
        cursor.close()
        conn.close()


    def show(self):
        for each in self.infoList:
            print "author: "+each['author']
            print "title: "+each['title']
            print "date: "+each['date']
            print "url: "+each['url']
            print "reply: "+str(each['reply'])
            print "view: "+str(each['view'])
            print '\n'

if __name__ == '__main__':
    # quick manual test: cache one fake record and write it to the database
    save = saveSqlite()
    save.saveSingle('网', 'aaa', '2008-10-10 10:10:10', 'www.baidu.com', 1, 1)
    # save.show()
    save.toMySQL()
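The insert statement above assumes an info table already exists in db_name; a minimal sketch of a matching schema follows (the column types are an assumption, adjust them to your own data):

# create a table matching the columns used by toMySQL();
# the column types below are an assumption, not taken from the article
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306, db='db_name', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    create table if not exists info (
        `title`  varchar(255),
        `author` varchar(64),
        `url`    varchar(255),
        `date`   varchar(32),
        `reply`  int,
        `view`   int
    )
""")
conn.commit()
cursor.close()
conn.close()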
Main crawler code
import requests
from lxml import etree
from cc98 import uni_2_native, URLs, saveInfo

# forge a request header for the target site; copy these values from your browser's developer tools
headers = {
    'Accept': '',
    'Accept-Encoding': '',
    'Accept-Language': '',
    'Connection': '',
    'Cookie': '',
    'Host': '',
    'Referer': '',
    'Upgrade-Insecure-Requests': '',
    'User-Agent': ''
}
url = 'http://www.cc98.org/list.asp?boardid=459&page=1&action='
cc98 = 'http://www.cc98.org/'

print "get infomation from cc98..."

urls = URLs.getURLs(url, "page", 50)
savetools = saveInfo.saveSqlite()

for url in urls:
    r = requests.get(url, headers=headers)
    html = uni_2_native.get_native(r.text)

    selector = etree.HTML(html)
    content_tr_list = selector.xpath('//form/table[@class="tableborder1 list-topic-table"]/tbody/tr')

    for each in content_tr_list:
        href = each.xpath('./td[2]/a/@href')
        if len(href) == 0:
            continue
        else:
            # each xpath() call returns a list; these single-pass loops just
            # pull out its only element
            for each_href in href:
                link = cc98 + each_href
            title_author_time = each.xpath('./td[2]/a/@title')

            # the title attribute packs title, author and post time on separate lines
            for info in title_author_time:
                info_split = info.split('\n')
                title = info_split[0][1:len(info_split[0])-1]
                author = info_split[1][3:]
                date = info_split[2][3:]

            # the fourth column holds the "reply/view" counts
            hot = each.xpath('./td[4]/text()')
            for hot_num in hot:
                reply_view = hot_num.strip().split('/')
                reply, view = reply_view[0], reply_view[1]
            savetools.saveSingle(author=author, title=title, date=date, url=link, reply=reply, view=view)

print "All got! Now saving to Database..."
# savetools.show()
savetools.toMySQL()
print "ALL CLEAR! Have Fun!"

