Reading Web Page Content with Python 3: Python 3 Crawler 04 (more examples, such as processing fetched page content)

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import os
import re

import requests
from bs4 import BeautifulSoup, NavigableString

# Example 1: fetch the Qiushibaike front page and print the text of each joke.
res = requests.get("https://www.qiushibaike.com/")
qiushi = res.content
soup = BeautifulSoup(qiushi, "html.parser")
duanzis = soup.find_all(class_="content")
for i in duanzis:
    duanzi = i.span.contents[0]   # first child node of the <span>
    # duanzi = i.span.string      # equivalent when the span has a single child
    print(duanzi)
    # print(i.span.string)
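In practice this site may reject the default requests User-Agent, in which case the loop above comes back empty. A minimal sketch of the same extraction with an explicit browser-style header and a guard against spans that have no plain string; the header value is an assumption, any common browser UA should work:

# Sketch: same extraction, with an explicit User-Agent and None checks.
# The header string below is an assumption, not part of the original post.
import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0"}
res = requests.get("https://www.qiushibaike.com/", headers=headers, timeout=10)
soup = BeautifulSoup(res.content, "html.parser")
for item in soup.find_all(class_="content"):
    span = item.span
    if span is not None and span.string is not None:
        print(span.string.strip())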

# Example 2: fetch a 699pic search page and save every image to a local jpg/ folder.
res = requests.get("http://699pic.com/sousuo-218808-13-1-0-0-0.html")
image = res.content
soup = BeautifulSoup(image, "html.parser")
images = soup.find_all(class_="lazy")
for i in images:
    original = i["data-original"]  # the real image URL (lazy loading)
    title = i["title"]
    # print(title)
    # print(original)
    # print("")
    try:
        # os.path.join is portable; avoids the Windows-only "\\jpg\\" path.
        with open(os.path.join(os.getcwd(), "jpg", title + ".jpg"), "wb") as file:
            file.write(requests.get(original).content)
    except Exception:
        # Skip images whose title is not a legal file name or whose download fails.
        pass
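One reason the blanket except is needed above is that open() fails when the jpg/ folder does not exist, and some titles contain characters that are illegal in file names. A small sketch that removes both failure modes up front; safe_name is a hypothetical helper, not part of the original code:

# Sketch: create the folder first and sanitize titles before opening files.
# safe_name is a hypothetical helper introduced here for illustration.
import os
import re

save_dir = os.path.join(os.getcwd(), "jpg")
os.makedirs(save_dir, exist_ok=True)  # no error if the folder already exists

def safe_name(title):
    # Replace characters that Windows and Unix disallow in file names.
    return re.sub(r'[\\/:*?"<>|]', "_", title)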

# Example 3: same kind of search page, but only print each image's title and URL.
r = requests.get("http://699pic.com/sousuo-218808-13-1.html")
fengjing = r.content
soup = BeautifulSoup(fengjing, "html.parser")
# Find all lazily loaded image tags.
images = soup.find_all(class_="lazy")
# print(images)  # find_all returns a list-like ResultSet
for i in images:
    jpg_rl = i["data-original"]  # the image URL
    title = i["title"]           # the image title
    print(title)
    print(jpg_rl)
    print("")

# Example 4: explore a page's tree with find(), .contents, .children and .descendants.
# Each reassignment below tries an alternative lookup; only the last one is kept.
r = requests.get("https://www.qiushibaike.com/")
r = requests.get("http://www.cnblogs.com/nicetime/")
blog = r.content
soup = BeautifulSoup(blog, "html.parser")
soup = BeautifulSoup(blog, features="lxml")  # same document, parsed with lxml instead
print(soup.contents[0].contents)

tag = soup.find('div')                            # first <div> on the page
tag = soup.find(class_="menu-bar menu clearfix")  # lookup by class
tag = soup.find(id="menu")                        # lookup by id
print(list(tag))

tag01 = soup.find(class_="c_b_p_desc")
print(len(list(tag01.contents)))     # direct children, as a list
print(len(list(tag01.children)))     # direct children, as an iterator
print(len(list(tag01.descendants)))  # all nested nodes, recursively

print(tag01.contents)
print(tag01.children)  # an iterator object, not the nodes themselves
for i in tag01.children:
    print(i)

print(len(tag01.contents))
for i in tag01:  # iterating a tag iterates its direct children
    print(i)

print(tag01.contents[0].string)
print(tag01.contents[1])
print(tag01.contents[1].string)
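The counts printed above depend on whatever HTML the blog happens to serve, which makes the difference between .contents, .children and .descendants hard to see. A self-contained sketch on a literal document makes it explicit:

# Sketch: .contents vs .children vs .descendants on a tiny literal document.
from bs4 import BeautifulSoup

demo = BeautifulSoup("<div><p>a<b>b</b></p></div>", "html.parser")
div = demo.div
print(div.contents)                # [<p>a<b>b</b></p>]  - list of direct children
print(len(list(div.children)))     # 1 - the same nodes, but as an iterator
print(len(list(div.descendants)))  # 4 - <p>, 'a', <b>, 'b': every nested node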

# Example 5: a GBK-encoded page. When the server sends no charset, requests
# falls back to ISO-8859-1, so the text must be re-encoded and decoded as GBK.
url = "http://www.dygod.net/html/tv/oumeitv/109673.html"
s = requests.get(url)
print(s.text.encode("iso-8859-1").decode('gbk'))

# Pull the ftp download links out of the raw text, then fix their encoding too.
res = re.findall('href="(.*?)">ftp', s.text)
for resi in res:
    a = resi.encode("iso-8859-1").decode('gbk')
    print(a)
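The encode/decode round-trip works, but requests can also be told the page's real encoding directly, after which .text decodes correctly everywhere; a sketch of that alternative:

# Sketch: set the response encoding once instead of re-encoding by hand.
# apparent_encoding is detected from the body (it may report GB2312/GBK here).
import requests

s = requests.get("http://www.dygod.net/html/tv/oumeitv/109673.html")
s.encoding = s.apparent_encoding  # or simply: s.encoding = "gbk"
print(s.text[:200])  # already readable; no manual encode/decode needed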
