#!/usr/bin/env python3
# coding=utf-8
import urllib.request as url
# Open the web page; urlopen returns a file-like response object.
# The with-statement makes sure the connection is closed afterwards.
with url.urlopen('http://www.baidu.com') as web_file:
    # Read the response one line at a time (each line is a bytes object).
    for line in web_file:
        print(line)
    # This prints the response object itself, not the content of the page.
    print(web_file)
# 1.2 使用正则表达式过滤图片url
import re
import urllib.request as url
# Download the page and decode it to text: read() returns bytes, and the
# str patterns below cannot be applied to a bytes object.
with url.urlopen("http://www.baidu.com") as response:
    charset = response.headers.get_content_charset() or "utf-8"
    context = response.read().decode(charset, errors="replace")

# Collect every <img ...> tag that has a src attribute.  [^>]* keeps each
# match inside a single tag instead of greedily spanning several tags.
imgs = re.findall(r"<img[^>]*?src=[^>]*>", context)

# Pull the value of the src attribute (quoted or unquoted) out of each tag.
img_urls = []
for img in imgs:
    img_urls.extend(re.findall(r"""src\s*=\s*["']?([^"'\s>]+)""", img))
print(img_urls)
# 1.3 使用BeautifulSoup过滤图片标签
from bs4 import BeautifulSoup
import urllib.request as urlrq
def openurl_and_getsoup(url):
    """Fetch *url* and return its content parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to download.

    Returns:
        A BeautifulSoup object built from the response body with the
        "lxml" parser.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlrq.urlopen(url) as web:
        # Hand the raw bytes to BeautifulSoup so it can work out the
        # encoding itself.  str(bytes) would instead produce the "b'...'"
        # repr, with escaped non-ASCII bytes and literal "\n" sequences.
        return BeautifulSoup(web.read(), "lxml")
# Named page_url (not url) so the module alias `url` imported for
# urllib.request earlier in this file is not overwritten by a string.
page_url = "http://www.baidu.com"
soup = openurl_and_getsoup(page_url)
# Collect every <img> tag (with its attributes) into a list.
all_img = soup.find_all(['img'])
# soup.img['src'] would give the URL of a single image.
# Print the first <img> tag found in the document (None if there is none).
print(soup.img)
# Print the list of all <img> tags.
print(all_img)