Python scraping crawler demo source code (an ulunwen original)

Python scraping crawler demo source code, an ulunwen original. Reposting is prohibited.

#! /usr/bin/python
# -*- coding: gb2312 -*-
from bs4 import BeautifulSoup
import re
import urllib
import csv
import time

realpage = []

def save_csv(linedata):
    # Append one row per call. writerow (not writerows) is correct here,
    # since linedata is a single flat list of fields.
    csvfile = open('data.txt', 'a')
    writer = csv.writer(csvfile)
    writer.writerow(linedata)
    csvfile.close()

def GetPageText(links):
    for link in links:
        detailinfo = []  # reset per page, otherwise fields pile up across rows
        PageContent = urllib.urlopen(link).read()
        # The site is gb2312-encoded; normalize to utf-8 before parsing
        PageContent = unicode(PageContent, "gb2312", "ignore").encode("utf8")
        PageSoup = BeautifulSoup(PageContent, 'html.parser')
        # Row layout: URL, page title (tags stripped), meta description
        detailinfo.append(link)
        detailinfo.append(re.compile(r'<[^>]+>', re.S).sub('', ''.join(PageSoup.title)))
        # Assumes the description <meta> tag directly follows <title> in <head>
        detailinfo.append(repr(PageSoup.title.next_sibling.next_sibling.get('content')))
        print "Start write file"
        print detailinfo
        #raw_input()  # uncomment to pause after each page while debugging
        save_csv(detailinfo)
        time.sleep(1)  # throttle requests

def webopen(link):
    content = urllib.urlopen(link).read()
    # The article list lives in <div class="catalog05 catalog05d">
    soup = BeautifulSoup(content, 'html.parser').find('div', class_='catalog05 catalog05d')
    alink = soup.find_all('a', {'target': '_blank'})
    for al in alink:
        urlhref = al.get('href')
        if "html" in urlhref:
            print urlhref
            realpage.append(urlhref)
    GetPageText(realpage)

#link = ["index.html"]
link = 'http://ulunwen.com'
webopen(link)
#GetPageText(link)
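The listing above is Python 2 (urllib.urlopen, print statements, unicode()), which has reached end of life. Below is a minimal Python 3 sketch of the same crawler, not the original author's code: it assumes the same site layout (an article list inside <div class="catalog05 catalog05d"> and a <meta name="description"> tag on each article page) and swaps the fragile next_sibling lookup for an explicit meta search.

#!/usr/bin/env python3
# Minimal Python 3 port of the demo above (a sketch under assumptions,
# not the original code): the description is read via an explicit
# <meta name="description"> lookup rather than title.next_sibling.
import csv
import time
import urllib.request
from urllib.parse import urljoin

from bs4 import BeautifulSoup

def fetch(url):
    # The site serves gb2312; decode explicitly, ignoring stray bytes
    return urllib.request.urlopen(url).read().decode('gb2312', 'ignore')

def collect_links(index_url):
    soup = BeautifulSoup(fetch(index_url), 'html.parser')
    box = soup.find('div', class_='catalog05 catalog05d')
    return [urljoin(index_url, a['href'])
            for a in box.find_all('a', target='_blank')
            if 'html' in a.get('href', '')]

def scrape(index_url, outfile='data.csv'):
    with open(outfile, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for link in collect_links(index_url):
            soup = BeautifulSoup(fetch(link), 'html.parser')
            title = soup.title.get_text(strip=True) if soup.title else ''
            meta = soup.find('meta', attrs={'name': 'description'})
            desc = meta.get('content', '') if meta else ''
            writer.writerow([link, title, desc])
            time.sleep(1)  # throttle requests, as the original does

if __name__ == '__main__':
    scrape('http://ulunwen.com')

The requests library would also work here, but the port keeps the dependency footprint of the original: only BeautifulSoup and the standard library.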

