python 爬虫:用自带库抓取网页内容

版本:Python 2.7.10

# -*- coding: utf-8 -*-

import sys

import lxml

import requests

import codecs

import time

from lxml import etree

from lxml.html.clean import Cleaner

# Python 2 only: re-import sys to restore the setdefaultencoding hook
# (it is deleted from the module at interpreter startup).
reload(sys)
sys.setdefaultencoding('utf8')  # default codec for implicit str<->unicode conversion

# Wrap stdout/stderr so `print` with Chinese text does not raise
# UnicodeEncodeError when the terminal encoding is not UTF-8.
if sys.stdout.encoding != 'UTF-8':
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout, 'strict')
if sys.stderr.encoding != 'UTF-8':
    sys.stderr = codecs.getwriter('utf-8')(sys.stderr, 'strict')

# Output file to which scraped tag texts are appended.
demo_file = "/Library/temp/demo.txt"

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def save_tags(url):

print '请求',url

resp = requests.get(url) #请求

print '请求完成'

if not resp:

print '无响应内容'

return

txt = resp.text

print 'txt = ',type(txt)

dom = etree.HTML(txt)

print 'dom type = ',type(dom)

xpath = '//div[@class=\'myClassName\']/a' //xpath

result = dom.xpath(xpath)

if not result or len(result)<1:

print 'xpath = ',xpath,' 无内容'

return

n=0

print '系统默认编码:',sys.getdefaultencoding()

print '准备写入文件:',demo_file

f = codecs.open(demo_file,'a+','utf-8')

f.write('###{}\n'.format(url))

for t in result:

n=n+1

txt = t.text.strip()

print n,txt

f.write('{}\n'.format(txt))

f.close()

print '写入文件结束:{}'.format(f.name)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def run():

url_base = 'http://www.xxx.net/list?page='

page_index=1

end = 863

print '爬取开始'

for i in range(page_index,end+1):

print '*'*30

url = '{}{}'.format(url_base,i)

save_tags(url)

print 'sleep 中...'

time.sleep(3)

print '*'*30

print '爬取结束'

run()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值