# 第一个爬虫程序
# 单线程爬虫，使用 requests 模块，并使用 zip 实现对多个列表的并行 for 循环
# coding=utf-8
import requests
import re
# Listing page of commentary articles that this script scrapes.
ARTICLE_LIST_URL = 'http://money.163.com/special/pinglun/'

# All patterns use re.S so that '.' also matches newlines inside the markup.
# Head of an item: from the opening "item_top" div up to the first '">'.
_ITEM_HEAD_RE = re.compile(r'<div class="item_top">(.*?)">', re.S)
# Whole item: from the "item_top" div up to the following "mod_list" list.
_ITEM_BODY_RE = re.compile(
    r'<div class="item_top">(.*?)<ul class="mod_list">', re.S)
_HREF_RE = re.compile(r'<a href="(.*)', re.S)
_TITLE_RE = re.compile(r'title="(.*?)" class=', re.S)
_TIME_RE = re.compile(r'<span class="time">(.*?)</span>', re.S)


def parse_articles(text):
    """Extract (title, created_at, url) tuples from the listing page HTML.

    Titles and dates are collected from every item section in document
    order; URLs are collected from every item head. The three sequences
    are then combined positionally with ``zip``, so the result is as long
    as the shortest of them.

    NOTE(review): the positional pairing assumes every item on the page
    carries exactly one title, one date and one link -- confirm against
    the live markup.

    :param text: decoded HTML of the listing page.
    :returns: list of ``(title, created_at, url)`` tuples of strings.
    """
    urls = []
    for head in _ITEM_HEAD_RE.findall(text):
        # The head ends right before the closing '">' of the first tag
        # attribute, so everything after '<a href="' is the link target.
        # Store the string itself (not the findall() list) and keep an
        # empty placeholder when an item has no link so later items do
        # not shift onto the wrong title.
        match = _HREF_RE.search(head)
        urls.append(match.group(1) if match else '')

    titles = []
    dates = []
    # Search each section directly instead of regex-ing over the repr()
    # of the list: no escape round trip, and a lazy match can no longer
    # run across a section boundary.
    for section in _ITEM_BODY_RE.findall(text):
        titles.extend(_TITLE_RE.findall(section))
        dates.extend(_TIME_RE.findall(section))

    return list(zip(titles, dates, urls))


def main():
    """Download the listing page and print one line per article.

    :raises requests.RequestException: on connection problems, timeouts
        or a non-success HTTP status.
    """
    # A timeout keeps the script from hanging on an unresponsive server.
    response = requests.get(ARTICLE_LIST_URL, timeout=10)
    # Fail loudly instead of parsing an error page as if it were the list.
    response.raise_for_status()
    for title, created_at, url in parse_articles(response.text):
        print('title:%s, created_at:%s, url:%s' % (title, created_at, url))


if __name__ == '__main__':
    main()