爬一定量百度贴吧上的帖子题目和题目下一层的几点儿文字描述
主要是先分析下抑郁症患者主要在聊些什么
优点只有勉强贴近完成需求
缺点很多:
1)速度慢,几乎是5秒才一条帖子
2)会不明原因地卡住,而且不报错;大概是因为设计了重试次数,但不确定重试是否真的起作用
3)没有加入如果ip被封的应对代码
唉
=。= 先硬着头皮上吧
linux版本
其中向mysql插入中文时出现乱码的解决方法是:在创建python里的connect里加上utf8以及mysql数据库创建表时指定utf8的charset
#!/usr/bin/python
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
import itertools
import urlparse
import time
import MySQLdb
import numpy as np
import string
from selenium import webdriver
def download(url,num_retries=2):
print 'Downloading:',url
try:
driver = webdriver.PhantomJS(executable_path='/home/twq/software/phantomjs-2.1.1-linux-x86_64/bin/phantomjs')
driver.get(url)
time.sleep(2)
html = driver.page_source
driver.close()
except:
print 'Download error'
if num_retries>0:
return download(url,num_retries-1)
return html
conn = MySQLdb.connect(
host='localhost',
port=3306,
user='root',
passwd='hello123',
db='test',
charset='utf8'
)
for pageNo in np.arange(0,10000,1):
baseURL = 'http://tieba.baidu.com/f?kw=%E6%8A%91%E9%83%81%E7%97%87&ie=utf-8&pn='+ str(pageNo*50)
page = download(baseURL)
pattern = re.compile('threadlist_title pull_left j_th_tit.*?<a href="/p/(.*?)" title="(.*?)" target="_blank', re.S)
rlts = re.findall(pattern,page)
for rlt in rlts:
tieziUrl = 'http://tieba.baidu.com/p/'+rlt[0]
tieziPage = download(tieziUrl)
tieziPattern = re.compile('d_post_content j_d_post_content clearfix">(.*?)<',re.S)
tieziConts = re.findall(tieziPattern,tieziPage)
tieziText = ''
print 'page', str(pageNo), rlt[0], rlt[1]
for tieziCont in tieziConts:
tieziText = tieziText + tieziCont
tieziText = tieziText.replace(' ','').replace('\n','')
print 'content ok'
try:
cur = conn.cursor()
cur.execute("insert into tmpyy values('"+rlt[0]+"','"+rlt[1]+"','"+tieziText+"')")
cur.close()
conn.commit()
except:
cur = conn.cursor()
cur.execute("insert into tmpyy values('" + rlt[0] + "','error','error')")
cur.close()
conn.commit()
print 'insert ok'
print "end"
win版本
#!/usr/bin/python
# -*- coding:utf-8 -*-
import re
import csv
import time
import numpy as np
import string
from selenium import webdriver
import sys
# HACK: Python 2-only workaround — force the process-wide default encoding to
# utf-8 so the implicit str<->unicode conversions in the file write below do
# not raise UnicodeDecodeError.  sys.setdefaultencoding() is deleted by
# site.py at startup, hence the reload(sys) to bring it back.  This is a
# well-known fragile trick; it is unnecessary (and impossible) on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
def download(url,num_retries=2):
print 'Downloading:',url
try:
driver = webdriver.PhantomJS(executable_path='D:\\phantomjs-2.1.1-windows\\bin\\phantomjs')
driver.get(url)
time.sleep(2)
html = driver.page_source
driver.close()
except:
print 'Download error'
if num_retries>0:
return download(url,num_retries-1)
return html
for pageNo in np.arange(0,10000,3):
baseURL = 'http://tieba.baidu.com/f?kw=%E6%8A%91%E9%83%81%E7%97%87&ie=utf-8&pn='+ str(pageNo*50)
page = download(baseURL)
pattern = re.compile('threadlist_title pull_left j_th_tit.*?<a href="/p/(.*?)" title="(.*?)" target="_blank', re.S)
rlts = re.findall(pattern,page)
for rlt in rlts:
tieziUrl = 'http://tieba.baidu.com/p/'+rlt[0]
tieziPage = download(tieziUrl)
tieziPattern = re.compile('d_post_content j_d_post_content clearfix">(.*?)<',re.S)
tieziConts = re.findall(tieziPattern,tieziPage)
tieziText = ''
for tieziCont in tieziConts:
tieziText = tieziText + tieziCont
tieziText = tieziText.replace(' ','').replace('\n','')
f = open("E:\\tst.txt", 'ab+')
content = str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) + '\t' + str(rlt[0]) +'\t'+ rlt[1] +'\t'+ tieziText+'\n'
f.write(content)
f.close()
print 'content ok',str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
print pageNo,'ok'
print "end"