Scraping WAP Baidu Tieba and the Phoenix Finance (ifeng) A-share list

I wrote both of these crawlers three months ago, when I had just started learning Python, by imitating examples on GitHub. The code is fairly messy, with no IP proxy and no time.sleep pauses between requests. The first one uses XPath, the second one regular expressions; I still remember the excitement of the first time a regex I wrote matched the data I wanted. As for storage, the first crawler saves to a TXT file and the second writes each field into the columns of a MySQL table.

Having scraped quite a few more sites since then, today I would reach for the requests and BeautifulSoup4 packages instead; they are genuinely much more convenient.
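As a quick illustration of that combination, here is a minimal sketch of the same fetch-and-extract pattern with requests and BeautifulSoup4. The URL is a placeholder and the div class is only borrowed from the Tieba crawler below; this is not a tested scraper.

# Minimal requests + BeautifulSoup4 sketch; the URL and selector are placeholders.
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
resp = requests.get('http://example.com/some/list', headers=headers, timeout=10)
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'html.parser')

# Pull the text of every <div class="i"> node, much like the XPath version below.
for node in soup.find_all('div', class_='i'):
    print(node.get_text(strip=True))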

------------

Scraping WAP Baidu Tieba and saving the posts to a TXT file




# -*- coding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import requests
from lxml import etree


def spider():
    yulu = []
    # Walk the 38 pages of the thread; pn advances 30 posts per page.
    for i in range(0, 38):
        url = 'http://tieba.baidu.com/mo/q---E69E1F2CE3B3F602E8A4E9DBB498F420%3AFG%3D1--1-1-0--2--wapp_1492841547755_799/m?kz=4668253092&new_word=&pn={}0&lp=6005'.format(i)
        html = requests.get(url)
        select = etree.HTML(html.content)
        content_field = select.xpath('//div[@class="d"]')
        print u'新一页'  # a new page
        for each in content_field:
            # Post bodies sit in <div class="i"> nodes inside this block
            # (relative './/' so we only search within the current node).
            items = each.xpath('.//div[@class="i"]/text()')
            for item in items:
                print item
                yulu.append(item)
    print yulu
    # Dump everything to a plain text file, one post per line.
    with open('data2.txt', 'wb') as f:
        for item in yulu:
            f.write((item + '\n').encode('utf-8'))


spider()
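Neither crawler throttles its requests, which the introduction already admits. If I were rerunning this today, the first addition would be a small random delay plus an explicit User-Agent. A sketch, with an arbitrary 1 to 3 second range:

import time
import random
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}

def polite_get(url):
    # Sleep 1 to 3 seconds before each request; the range is an arbitrary choice.
    time.sleep(random.uniform(1, 3))
    return requests.get(url, headers=headers, timeout=10)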




------------


Scraping the Phoenix Finance (ifeng) A-share list


#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import urllib2
import MySQLdb


class FH:

    def __init__(self):
        self.pageIndex = 1
        self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
        self.headers = {'User-Agent': self.user_agent}
        self.list = []

    def getPage(self, pageIndex):
        # Fetch one page of the A-share list, sorted by percentage change.
        try:
            url = 'http://app.finance.ifeng.com/list/stock.php?t=ha&f=chg_pct&o=desc&p=' + str(pageIndex)
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            pageCode = response.read().decode('utf-8')
            return pageCode
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print "error", e.reason
            return None

    def getPageItems(self, pageIndex):
        pageCode = self.getPage(pageIndex)
        if not pageCode:
            print "page load error"
            return None
        # Each table row holds the stock link, code and name in consecutive <td> cells.
        pattern = re.compile('<td><a href="(.*?)" target="_blank">(.*?)</a></td>.*?target="_blank">(.*?)</a></td>', re.S)
        items = re.findall(pattern, pageCode)
        pagelist = []
        # Open one connection per page instead of one per row.
        conn = MySQLdb.connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='94159415',
            db='movie',
            charset='utf8'
        )
        cur = conn.cursor()
        for item in items:
            pagelist.append([item[0].strip(), item[1].strip(), item[2].strip()])
            print(item[0])
            print(item[1])
            print(item[2])
            # Parameterised insert; the original string formatting broke on quotes.
            cur.execute("insert into A_STOCK_LIST VALUES (NULL,%s,%s,%s)", (item[0], item[1], item[2]))
        cur.close()
        conn.commit()
        conn.close()
        return pagelist

    def loadPage(self):
        # Keep at most two pages buffered; fetch the next one when we run low.
        if len(self.list) < 2:
            pagelist = self.getPageItems(self.pageIndex)
            if pagelist:
                self.list.append(pagelist)
                self.pageIndex += 1

    def start(self):
        print u'正在读取'  # loading
        self.loadPage()
        nowPage = 0
        # Walk through 24 more pages of the listing.
        while nowPage < 24:
            nowPage += 1
            del self.list[0]
            self.loadPage()


spider = FH()
spider.start()
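The INSERT above assumes a table named A_STOCK_LIST with an auto-increment id followed by three text columns. Here is a sketch of a schema that would accept those rows; the column names and VARCHAR lengths are my own guesses, not the original database:

# Sketch only: column names and lengths are guesses, not the original schema.
import MySQLdb

conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                       passwd='94159415', db='movie', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS A_STOCK_LIST (
        id   INT AUTO_INCREMENT PRIMARY KEY,
        url  VARCHAR(255),
        code VARCHAR(32),
        name VARCHAR(64)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()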
