爬一定量百度贴吧上的帖子题目和题目下一层的几点儿文字描述
主要是先分析下抑郁症患者主要在聊些什么
优点只有勉强贴近完成需求
缺点很多:
1)速度慢,几乎是5秒才一条帖子
2)会不明原因地卡住,而且不报错;大概是因为设计了重试次数,但不确定重试是否真的起作用
3)没有加入如果ip被封的应对代码
唉
=。= 先硬着头皮上吧
linux版本
其中向mysql插入中文时出现乱码的解决方法是:在创建python里的connect里加上utf8以及mysql数据库创建表时指定utf8的charset
#!/usr/bin/python
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
import itertools
import urlparse
import time
import MySQLdb
import numpy as np
import string
from selenium import webdriver
def download(url,num_retries=2):
print 'Downloading:',url
try:
driver = webdriver.PhantomJS(executable_path='/home/twq/software/phantomjs-2.1.1-linux-x86_64/bin/phantomjs')
driver.get(url)
time.sleep(2)
html = driver.page_source
driver.close()
except:
print 'Download error'
if num_retries>0:
return download(url,num_retries-1)
return html
conn = MySQLdb.connect(
host='localhost',
port=3306,
user='root',
passwd='hello123',
db='test',
charset='utf8'
)
for pageNo in np.arange(0,10000,1):
baseURL = 'http://tieba.baidu.com/f?kw=%E6%8A%91%E9%83%81%E7%97%87&ie=utf-8&pn='+ str(pageNo*50)
page = download(baseURL)
pattern = re.compile('threadlist_title pull_left j_th_tit.*?<a href="/p/(.*?)" title="(.*?)" target="_blank', re.S)
rlts = re.findall(pattern,page)
for rlt in rlts:
tieziUrl = 'http://tieba.baidu.com/p/'+rlt[0]
tieziPage = download(tieziUrl)
tieziPattern = re.compile('d_post_content j_d_post_content clearfix">(.*?)<',re.S)
tieziConts = re.findall(tieziPattern,tieziPage)
tieziText = ''
print 'page', str(pageNo), rlt[0], rlt[1]
for tieziCont in tieziConts:
tieziText = tieziText + tieziCont
tieziText = tieziText.replace(' ','').replace('\n','')
print 'content ok'
try:
cur = conn.cursor()
cur.execute("insert into tmpyy values('"+rlt[0]+"','"+rlt[1]+"','"+tieziText+"')")
cur.close()
conn.commit()
except:
cur = conn.cursor()
cur.execute("insert into tmpyy values('" + rlt[0] + "','error','error')")
cur.close()
conn.commit()
print 'insert ok'
print "end"
win版本
#!/usr/bin/python
# -*- coding:utf-8 -*-
import re
import csv
import time
import numpy as np
import string
from selenium import webdriver
import sys
# HACK: Python 2-only workaround — force the process-wide default encoding to
# utf-8 so the implicit str<->unicode conversions in the file write below do
# not raise UnicodeDecodeError.  sys.setdefaultencoding() is deleted by
# site.py at startup, hence the reload(sys) to bring it back.  This is a
# well-known fragile trick; it is unnecessary (and impossible) on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
def download(url,num_retries=2):
print 'Downloading:',url
try:
driver = webdriver.PhantomJS(executable_path='D:\\phantomjs-2.1.1-windows\\bin\\phantomjs')
driver.get(url)
time.sleep(2)
html = driver.page_source
driver.close()
except:
print 'Download error'
if num_retries>0:
return download(url,num_retries-1)
return html
for pageNo in np.arange(0,10000,3):
baseURL = 'http://tieba.baidu.com/f?kw=%E6%8A%91%E9%83%81%E7%97%87&ie=utf-8&pn='+ str(pageNo*50)
page = download(baseURL)
pattern = re.compile('threadlist_title pull_left j_th_tit.*?<a href="/p/(.*?)" title="(.*?)" target="_blank', re.S)
rlts = re.findall(pattern,page)
for rlt in rlts:
tieziUrl = 'http://tieba.baidu.com/p/'+rlt[0]
tieziPage = download(tieziUrl)
tieziPattern = re.compile('d_post_content j_d_post_content clearfix">(.*?)<',re.S)
tieziConts = re.findall(tieziPattern,tieziPage)
tieziText = ''
for tieziCont in tieziConts:
tieziText = tieziText + tieziCont
tieziText = tieziText.replace(' ','').replace('\n','')
f = open("E:\\tst.txt", 'ab+')
content = str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) + '\t' + str(rlt[0]) +'\t'+ rlt[1] +'\t'+ tieziText+'\n'
f.write(content)
f.close()
print 'content ok',str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
print pageNo,'ok'
print "end"