# coding=utf-8
# Python 2 script: scrape the news listing from the 163.com mobile "local" channel (Beijing page).
import urllib2
import zlib
from pybloomfilter import BloomFilter
from bs4 import BeautifulSoup  # the 'lxml' parser used below also requires lxml to be installed
request_headers = {
    'Accept': "image/webp,image/apng,image/*,*/*;q=0.8",
    'Accept-Encoding': "gzip, deflate",
    'Accept-Language': "zh-CN,zh;q=0.8",
    'Connection': "keep-alive",
    'Referer': "http://3g.163.com/touch/local?dataversion=A&uversion=A&version=v_standard",
    'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Mobile Safari/537.36",
}
# Create a Bloom filter for URL deduplication (capacity ~16M items, 1% false-positive rate).
# Note: some pybloomfilter versions also require a filename argument for the mmap backing file.
download_bf = BloomFilter(1024 * 1024 * 16, 0.01)
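# The filter is created above but never consulted later in this script; a
# typical use (a sketch, not part of the original flow) is to skip pages
# that were already crawled:
#   if url not in download_bf:
#       download_bf.add(url)
#       # ... fetch and parse the page ...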
url = 'http://3g.163.com/touch/local?dataversion=A&uversion=A&version=v_standard'
req = urllib2.Request(url, headers=request_headers)
response = urllib2.urlopen(req)
htmlcontent = response.read()
# If the response is gzip-encoded, decompress it first to avoid garbled output.
gzipped = response.headers.get('Content-Encoding')
if gzipped:
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer
    htmlcontent = zlib.decompress(htmlcontent, 16 + zlib.MAX_WBITS)
print htmlcontent
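# Equivalent gzip handling with the gzip module (a sketch; would need
# "import gzip" and "import StringIO" at the top, as in the original
# import list):
#   buf = StringIO.StringIO(htmlcontent)
#   htmlcontent = gzip.GzipFile(fileobj=buf).read()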
soup = BeautifulSoup(htmlcontent, 'lxml')
urls = []
news_content = []
# Earlier selector attempts, kept for reference:
# a = soup.select('div.cm_news_main > ul.cm_ul_round > li > a')
# print a
# ul_contents = soup.select('ul[class="cm_ul_round ul_page1"] > li > a')
# print ul_contents
for link in soup.select('div.aslide > a'):
    urls.append(link.get('href'))
    news_content.append(link.text)
print urls
for i in news_content:
    print i
print len(news_content)
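# The scraped (title, url) pairs can also be collected into a DataFrame
# for inspection or export (a sketch; would need "import pandas as pd"
# at the top, as in the original import list):
#   df = pd.DataFrame({'title': news_content, 'url': urls})
#   df.to_csv('163_local_news.csv', index=False, encoding='utf-8')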
# Alternative selector for another page layout, kept for reference:
# for link in soup.select('div.ndi_main > h3 > a'):
#     urls.append(link.get('href'))
#     news_content.append(link.text)
# print urls
# print len(news_content)
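# A natural next step (a sketch, assuming each href is an absolute article
# URL) is to fetch every listed article once, using the Bloom filter above
# to avoid re-downloading:
#   for article_url in urls:
#       if article_url in download_bf:
#           continue
#       download_bf.add(article_url)
#       article_req = urllib2.Request(article_url, headers=request_headers)
#       article_html = urllib2.urlopen(article_req).read()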