本篇是我第一次利用bs写的爬虫代码,爬取网址:http://beijing.anjuke.com/tycoon/p1/
每页网址中唯一变化的部分是 p 后面的页码数字。可能因为这是老早之前写的代码了,所以现在一看,发现并没有什么难的,掌握基本要素即可。
废话不多说,直接上代码吧!
#encoding=utf8
import re
import urllib
import urllib2
from bs4 import BeautifulSoup
for i in range(1,56):
url='http://beijing.anjuke.com/tycoon/p'+str(i)
user_agent="Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:49.0) Gecko/20100101 Firefox/49.0"
headers={"User-Agent":user_agent}
request=urllib2.Request(url,headers=headers)
response=urllib2.urlopen(request)
html=response.read()
bs=BeautifulSoup(html,"html.parser")
data=bs.find_all("div",class_='jjr-itemmod')
for xinxi in data:
a=xinxi.find('div',class_='jjr-info').get_text("|",strip=True).encode('utf-8'+'|'+'\n')
a=a.replace(' ','')
a=a.replace('\n','')
print a
f=xinxi.find('div',class_='jjr-side').get_text("|",strip=True).encode('utf8')
print f
mm=open('k9p.txt','a+')
mm.write(a+"|"+f+"\n")
mm.close()