这两个爬虫都是三个月前刚开始接触 Python 时仿照 GitHub 案例写的,写得比较乱,也没有加 IP 代理和 time.sleep 停顿。第一个用到了 XPath,后面一个用了正则表达式。到现在我依然记得第一次成功按自己写的正则匹配到数据时的兴奋。至于保存数据,前者是保存到了 TXT 文档,后者存到了 MySQL 数据库的各列。
现在爬过那么多网站后,我会选择 requests 包和 BeautifulSoup4 包,这两个包用起来是真的很方便。
------------
爬取WAP百度贴吧,保存到TXT
# -*- coding:utf-8 -*-
# Python 2 idiom: force the interpreter's default string encoding to UTF-8
# so implicit str/unicode conversions of the scraped Chinese text don't
# raise UnicodeDecodeError. (The original repeated this whole dance twice;
# once is enough.)
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import urllib
import urllib2
import json

import requests
from lxml import etree
def spider():
i=0
yulu=[]
for i in range(0,38):
url='http://tieba.baidu.com/mo/q---E69E1F2CE3B3F602E8A4E9DBB498F420%3AFG%3D1--1-1-0--2--wapp_1492841547755_799/m?kz=4668253092&new_word=&pn={}0&lp=6005'.format(i)
html=requests.get(url)
select=etree.HTML(html.content)
content_field=select.xpath('//div[@class="d"]')
print u'新一页'
items=list()
for each in content_field:
content=each.xpath('//div[@class="i"]/text()')
items=content
for j in range(0,30):
print items[j]
yulu.append(items[j])
print yulu
with open('data2.txt', 'wb') as f:
for item in yulu:
line =item + '\n'
f.write(line.encode('utf-8'))
spider()
------------
爬取凤凰财经的A股列表
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
import thread
import time
import MySQLdb
class FH:
def __init__(self):
self.pageIndex = 1
self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
self.headers = {'User-Agent' :self.user_agent}
self.list = []
def getPage(self,pageIndex):
try:
url = 'http://app.finance.ifeng.com/list/stock.php?t=ha&f=chg_pct&o=desc&p='+ str(pageIndex)
request = urllib2.Request(url,headers=self.headers)
response = urllib2.urlopen(request)
pageCode = response.read().decode('utf-8')
return pageCode
except urllib2.URLError,e:
if hasattr(e,"reason"):
print "error",e.reason
return None
def getPageItems(self,pageIndex):
pageCode = self.getPage(pageIndex)
if not pageCode:
print "page load error"
return None
pattern = re.compile('<td><a href="(.*?)" target="_blank">(.*?)</a></td>.*?target="_blank">(.*?)</a></td>',re.S)
items = re.findall(pattern,pageCode)
pagelist = []
for item in items:
pagelist.append([item[0].strip(),item[1].strip(),item[2].strip()])
print(item[0])
print(item[1])
print(item[2])
conn= MySQLdb.connect(
host='localhost',
port = 3306,
user='root',
passwd='94159415',
db ='movie',
charset='utf8'
)
cur = conn.cursor()
cur.execute("insert into A_STOCK_LIST VALUES (NULL,'%s','%s','%s')"%(item[0],item[1],item[2]))
cur.close()
conn.commit()
conn.close()
return pagelist
def loadPage(self):
if len(self.list)<2:
pagelist = self.getPageItems(self.pageIndex)
if pagelist:
self.list.append(pagelist)
self.pageIndex +=1
def start(self):
print u'正在读取'
self.loadPage()
nowPage = 0
pagelist = self.list[0]
while nowPage<24:
nowPage +=1
del self.list[0]
self.loadPage()
# Script entry point: build the crawler and run the full scrape.
crawler = FH()
crawler.start()