爬取爱问知识人问题并保存到数据库

最新推荐文章于 2020-01-13 21:15:54 发布

遗忘了呵呵

最新推荐文章于 2020-01-13 21:15:54 发布

阅读量714

点赞数 1

分类专栏：网络爬虫文章标签：爬虫数据库正则博客

本文链接：https://blog.csdn.net/sinat_36802840/article/details/53823246

版权

网络爬虫专栏收录该内容

3 篇文章 0 订阅

订阅专栏

前言：
基于崔庆才的《Python爬虫实战六之抓取爱问知识人问题并保存至数据库》（http://cuiqingcai.com/1972.html）
上面博客由于显示问题，可能看不到全部代码，可以通过查看网页源代码看到。

自己在原题目的基础上做了一些修改：去掉了保存所有答案的部分，只保存好评答案；而且没有使用到BeautifulSoup，而是直接采用正则表达式匹配，所以tool.py也没有起作用。
最终保存问题，提问者，回答者，回答者URL，回答时间，回答内容。

学习到的：
1.将标准输出重定向到log文件中：在程序的最前面加上下面这两句即可。
f_handler = open('out.log', 'w')
sys.stdout = f_handler
这样，所有的print语句输出的内容就会保存到out.log文件中了。
实例可以看下面的main函数
2.tool.py这个很通用的页面处理模块
3.正则表达式的贪婪与不贪婪的不同作用

patterntime1 = re.compile(ur'<div class="ask_autho clearfix">.*<span class="ask-time mr10">(.*?)</span>', re.S)

该正则用来匹配提问时间，其中同时用到了贪婪与非贪婪两种模式。
具体来说，贪婪模式（.*）会尽可能多地匹配中间字符，一直匹配到页面中最后一个出现的 <span class="ask-time mr10"> 这一整串字符之前为止；
而非贪婪模式（.*?）则尽可能少地匹配，只要后面的模式（例如下一个 <）能够匹配上就立即停止。

注意事项：
1.数据库需要自己先创建好database和table，并且要和传入my_dict一一对应
2.注意编码和解码方式，本次都是用utf-8
3.被注释的地方都是用来调试

下面是这次的代码
aiwentomysql.py

# -*- coding:utf-8 -*-

import MySQLdb
import urllib2
import urllib
import re
import time
import sys
from bs4 import BeautifulSoup
import tool
import Mysql

class Spider(object):
    def __init__(self):
        self.URL = 'http://iask.sina.com.cn/c/187-all-1-new.html'
        #pageURllist存URL
        self.pageURllist = []
        #questiURllist
        self.questionsfullURL = []
        self.total_pagenum = None
        self.mysql = Mysql.Mysql()

    #得到某一页的内容
    def get_page(self,pageURL):
        request = urllib2.Request(pageURL)
        response = urllib2.urlopen(request)
        pagecontents = response.read().decode('utf-8')
        return pagecontents

    #得到目录有多少页
    def gettotalPageNum(self):
        pageURL = self.URL
        pagecontents = self.get_page(pageURL)
        pattern = re.compile(ur'<div class="pages"  pageCount="(.*?)".*?>', re.S)
        self.total_pagenum = re.search(pattern, pagecontents).group(1)
        return  True

    #在该页下得到所有问题的URL以及下一页的URL,pageURL为目录页的URL，num为该链接的位置在第几页
    def getquestionsURL(self,pageURL,num):
        pagecontents = self.get_page(pageURL)
        #得到问题的URL，存为questifullURL
        pattern = re.compile(r'<li class="list">.*?<div class="question-title">.*?<a href="(.*?)" target', re.S)
        questioninitalURl = re.findall(pattern, pagecontents)
        #print 'self.questionsfullURL插入问题的URL:'
        for URL in questioninitalURl:
            temURL = 'http://iask.sina.com.cn' + URL
            if temURL not in self.questionsfullURL:
                self.questionsfullURL.append(temURL)
                #print temURL

        #得到下一页的URL
        if num < int(self.total_pagenum):
            patternnextpage = re.compile(ur'<div class="pages".*?<a href=".*?" style=.*?>.*?</a>.*<a href="(.*?)" style="width: 65px">下一页</a>', re.S)
            nextpageURL = re.search(patternnextpage, pagecontents)
            if nextpageURL != None:
                tempageURL = 'http://iask.sina.com.cn/' + nextpageURL.group(1)
                if tempageURL not in self.pageURllist:
                    self.pageURllist.append(tempageURL)
                    print 'self.pageURllist插入第',str(num+1),'页的URL:' ,tempageURL
                    return True
            else:
                print '不能匹配到下一页URL'
                return False
        else:
            return True

    def get_questiondetail(self,questionurl):
        questioncontents =self.get_page(questionurl)
        #print '0'
        #问题
        patternquestion = re.compile(ur'<div class="question_text">.*?<pre style=.*?>(.*?)</pre>', re.S)
        question = re.search(patternquestion, questioncontents).group(1)
        #print '1'
        # 提问时间
        patterntime1 = re.compile(ur'<div class="ask_autho clearfix">.*<span class="ask-time mr10">(.*?)</span>', re.S)
        time1 = re.search(patterntime1, questioncontents).group(1)
        #print '2'
        #回答者
        patternanswerer = re.compile(ur'<div class="answer_tip clearfix">.*?<a href="(.*?)".*?>(.*?)</a>.*<span class="time mr10">(.*?)</span>', re.S)
        answer = re.search(patternanswerer, questioncontents)
        #print '3'
        if answer != None:
            answerURL = answer.group(1)
            answername = answer.group(2)
            answertime = answer.group(3)
            #回答内容
            pattern3 = re.compile(ur'<div class="answer_text">.*?<pre style=.*?">(.*?)</pre>', re.S)
            answercon = re.search(pattern3,questioncontents).group(1)
            ques_dict = {
                'question': question,
                'time': time1,
                'answerer': answername,
                'answererURL': answerURL,
                'answertime': answertime,
                'answertext': answercon,
                'questionURL': questionurl}
            insert_ID = self.mysql.insertData("iaskanswer",ques_dict)
            print '保存最佳答案成功,ID:',insert_ID
            #print question,time1,answername,answerURL,answertime,answercon,questionurl
            return True
        else:
            print '没有最佳答案，跳过此问题'
            return None

    def start(self):
        print '爬虫正在启动，开始爬取:',time.ctime()
        self.pageURllist.append(self.URL)
        self.gettotalPageNum()
        Totalpagenum = self.total_pagenum
        print '获取到目录共有' + str(Totalpagenum)+ '页:', time.ctime()
        for i in range(int(Totalpagenum)):
            print '开始读取第'+ str(i+1)+'页:', time.ctime()
            pageURL = self.pageURllist[i]
            if self.getquestionsURL(pageURL, i+1):
                print '这页读取完毕'
            else:
                break
        print '所有页都爬取完毕:', time.ctime(), '\n'

        print '开始爬取问题:' ,time.ctime()
        questionsnum = len(self.questionsfullURL)
        print '共有' + str(questionsnum)+ '问题:' ,time.ctime()
        for i in range(questionsnum):
            print '正在读取第' + str(i + 1) + '个问题:'
            temquestionURL = self.questionsfullURL[i]
            print temquestionURL
            self.get_questiondetail(temquestionURL)
        print '所有问题读取成功', time.ctime()
        print '正在结束爬虫...'
        print '爬虫结束：',time.ctime()
        return True

def main():
    """Script entry point: build a Spider and run one complete crawl."""
    # Debugging aid, disabled by default: uncomment both lines to send
    # everything the crawler prints to out.log instead of the console.
    #f_handler = open('out.log', 'w')
    #sys.stdout = f_handler
    Spider().start()

# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

tool.py

#  -*- coding:utf-8 -*-
import re

#处理页面标签类
class Tool:
    """Turn an HTML fragment into plain text.

    Each class attribute is a compiled pattern; ``replace`` applies them in
    a fixed order, so the structural tags are converted to whitespace before
    the final catch-all pattern strips whatever markup is left.
    """
    # <img> tags and runs of seven consecutive spaces are dropped.  The
    # pattern has no trailing empty alternative, so it never produces
    # zero-width matches.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Opening and closing hyperlink tags are dropped (the link text stays).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that end a line become "\n".
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # A table cell opener becomes "\t".
    replaceTD = re.compile(r'<td>')
    # A paragraph opener becomes "\n" followed by four spaces of indent.
    replacePara = re.compile(r'<p.*?>')
    # A single or doubled line break becomes one "\n".
    replaceBR = re.compile(r'<br><br>|<br>')
    # Any tag still present at the end is dropped.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self,x):
        """Return ``x`` with markup removed and surrounding whitespace stripped.

        The patterns are looked up on ``self`` so that a subclass can
        override any of them.  The order of the steps matters: the catch-all
        ``removeExtraTag`` must run last.
        """
        steps = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replacePara, "\n    "),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
        )
        for pattern, substitute in steps:
            x = pattern.sub(substitute, x)
        # strip() removes the leading/trailing whitespace the steps leave.
        return x.strip()

Mysql.py

# -*- coding:utf-8 -*-
import MySQLdb
import time

class Mysql(object):
    """Small wrapper around one MySQLdb connection, used to insert rows.

    The database and the target table must already exist, and the table's
    columns must match the keys of the dictionaries passed to ``insertData``.
    """

    def __init__(self):
        """Connect to the local ``tests`` database and open a cursor.

        A connection failure is reported on stdout and not raised; ``db``
        and ``cur`` then stay None and ``insertData`` refuses to run.
        """
        # Defined up front so a failed connect leaves usable attributes
        # instead of an AttributeError on the first insert.
        self.db = None
        self.cur = None
        try:
            self.db = MySQLdb.connect('127.0.0.1','root','********','tests')
            print('链接数据库test成功')
            self.cur = self.db.cursor()
        except MySQLdb.Error as e:
            print('%s 链接数据库错误，原因%d: %s ' % (time.ctime(), e.args[0], e.args[1]))

    def insertData(self,table, my_dict):
        """Insert one row into ``table`` and return its auto-increment id.

        ``my_dict`` maps column names to values.  Values are passed to the
        driver as query parameters, so quotes or backslashes inside them
        cannot break or alter the statement.  ``table`` and the column
        names are interpolated into the SQL text as-is and must therefore
        come from trusted code, never from crawled data.

        Returns the inserted row id, 0 when nothing was inserted (empty
        ``my_dict`` or the driver reported no affected row), or None when
        there is no connection or the database reported an error (the
        error is printed).
        """
        if self.db is None or self.cur is None:
            print('%s 数据库错误，原因: 未建立数据库连接' % time.ctime())
            return None
        if not my_dict:
            return 0
        try:
            self.db.set_character_set('utf8')
            columns = list(my_dict.keys())
            params = tuple(my_dict[name] for name in columns)
            placeholders = ','.join(['%s'] * len(columns))
            sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, ','.join(columns), placeholders)
            try:
                result = self.cur.execute(sql, params)
                insert_id = self.db.insert_id()
                self.db.commit()
                if result:
                    return insert_id
                return 0
            except MySQLdb.Error as e:
                self.db.rollback()
                if "key 'PRIMARY' " in e.args[1]:
                    print('%s 数据已存在' % time.ctime())
                else:
                    print('%s 插入数据失败，原因 %d: %s' % (time.ctime(), e.args[0], e.args[1]))
        except MySQLdb.Error as e:
            print('%s 数据库错误，原因%d: %s' % (time.ctime(), e.args[0], e.args[1]))
        return None

遗忘了呵呵

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
爬取爱问知识人问题并保存到数据库

前言：基于崔庆才的（http://cuiqingcai.com/1972.htmlPython）爬虫实战六之抓取爱问知识人问题并保存至数据库上面博客由于显示问题，可能看不到全部代码，可以通过查看网页源代码看到。自己就在题目上进行一些修改，去掉的保存所有答案，只保存好评答案，而且没有使用到Beautifulsoup，所以直接采取正则表带是匹配，所以tool.py也没有作用。最终保存问题，提
复制链接

扫一扫