# -*- coding: utf-8 -*-
"""Fetch a WeChat article page and save its title and publish time.

Version 1: plain urllib + BeautifulSoup. Fixes over the original draft:
uses the Python 3 ``urllib.request`` API (``urllib.urlopen`` no longer
exists), names the HTML parser explicitly, actually writes the extracted
text to ``1.txt`` (the original built the string and discarded it), and
closes the file via a context manager instead of leaking the handle.
"""
import urllib.request

from bs4 import BeautifulSoup

# The article URL; one logical string split only for line length.
URL = ('http://mp.weixin.qq.com/s?__biz=MTI0MDU3NDYwMQ==&mid=406948985'
       '&idx=1&sn=042190493d6a1f9b5213e53bba215e51'
       '&3rd=MzA3MDU4NTYzMw==&scene=6#rd')

# read() returns the raw page body as bytes in Python 3.
content = urllib.request.urlopen(URL).read()

# Name the parser explicitly so bs4 neither warns nor guesses.
soup = BeautifulSoup(content, 'html.parser')

# <title> carries the article title; the first <em> carries the timestamp.
title = soup.find('title').get_text()
pub_time = soup.find('em').get_text()

with open('1.txt', 'w', encoding='utf-8') as out:
    out.write(u'内容:' + title + ' ' + u'时间:' + pub_time)

'''
Notes (translated from the original Chinese comments):
- soup is the BeautifulSoup-parsed document; soup.title is the <title>
  tag and soup.p is the first <p> tag in the document. To get every
  matching tag, use find_all().
- find_all() returns a sequence you can loop over to visit each match.
- .get_text() returns the text of any tag obtained from a parsed
  document. Try: print(soup.p.get_text())
'''
# ===== alternative implementation: requests + lxml =====
# -*- coding: utf-8 -*-
"""Fetch the same WeChat article with requests and parse it with lxml.

Version 2 fixes over the original draft: the misspelled request headers
'Aceept-Encoding' and 'Accent-Language' are corrected to
'Accept-Encoding' / 'Accept-Language' (the typos meant the server never
saw them, so content negotiation silently did nothing); the Python-2
print statement becomes the print() function; and the local that
shadowed the stdlib ``time`` module is renamed.
"""
from bs4 import BeautifulSoup as bs
import requests

# One logical raw-string URL, split only for line length.
url = (r'http://mp.weixin.qq.com/s?__biz=MTI0MDU3NDYwMQ==&mid=406948985'
       r'&idx=1&sn=042190493d6a1f9b5213e53bba215e51'
       r'&3rd=MzA3MDU4NTYzMw==&scene=6#rd')

headers = {
    'User-agent': ('Mozilla/5.0(compatible; MSIE 10.0; '
                   'windows NT 6.1; wow64; Trident/6.0)'),
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US, en; q=0.5',
    'Connection': 'keep-alive',
}

response = requests.get(url, headers=headers)

# requests already transparently decompresses gzip/deflate bodies;
# decode the raw bytes as UTF-8 and hand the text to the lxml parser.
soup = bs(response.content.decode('utf-8'), 'lxml')

# soup('tag') is shorthand for soup.find_all('tag'); take the first hit.
title = soup('title')[0].text
pub_time = soup('em')[0].text

print(u'标题:%s\r\n时间:%s' % (title, pub_time))