本文仅作为学习笔记记录,若有侵权,即刻删除!
案例一:
爬取redbull公司名称、地址、邮编及电话:
# Example 1: scrape Red Bull China branch names, addresses, postcodes and
# phone numbers from the "about/branch" page into a pandas DataFrame.
import requests
import re
import bs4
import pandas as pd

url = "http://redbull.com.cn/about/branch"
# Timeout prevents the script from hanging forever on an unresponsive host.
response = requests.get(url, timeout=10)
response.raise_for_status()  # fail fast instead of parsing an HTTP error page
# Site serves Chinese text; let requests sniff the real charset rather than
# defaulting to ISO-8859-1 when the header omits it.
response.encoding = response.apparent_encoding
# Name the parser explicitly -- otherwise bs4 picks whichever is installed,
# which can vary between machines and emits a warning.
soup = bs4.BeautifulSoup(response.text, "html.parser")

# Each branch entry renders its name in <h2> and its details in <p> tags
# distinguished by CSS class (mapIco = address, mailIco = postcode, telIco = phone).
company = [tag.text for tag in soup.find_all("h2")]
add = [tag.text for tag in soup.find_all("p", attrs={"class": "mapIco"})]
mail = [tag.text for tag in soup.find_all("p", attrs={"class": "mailIco"})]
tel = [tag.text for tag in soup.find_all("p", attrs={"class": "telIco"})]

# Assign and print: the bare expression only displays in a REPL/notebook,
# not when run as a script.
df = pd.DataFrame({"company": company, "add": add, "mail": mail, "tel": tel})
print(df)
案例二:
链家二手房信息,一键爬取并保存为csv
# Example 2: scrape Lianjia (Suzhou) second-hand housing listings, pages 1-2,
# and save the result to a CSV file.
import requests
import bs4
import pandas as pd

huxing = []  # layout, e.g. "4室2厅"
area = []    # floor area, e.g. "124.66平米"
style = []   # decoration/finish, e.g. "精装"
name = []    # community / listing name
price = []   # total price

for page in range(1, 3):
    url = f"https://su.lianjia.com/ershoufang/pg{page}/"
    # Timeout prevents a dead page from hanging the whole crawl.
    response = requests.get(url, timeout=10)
    # Explicit parser keeps results consistent across machines.
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    # houseInfo text looks like:
    #   "4室2厅 | 124.66平米 | 南 北 | 精装 | 低楼层(共15层) | 板楼"
    # Split each listing once instead of re-querying the DOM per field.
    infos = [tag.text.split("|")
             for tag in soup.find_all("div", attrs={"class": "houseInfo"})]
    huxing.extend(info[0].strip() for info in infos)
    area.extend(info[1].strip() for info in infos)
    style.extend(info[3].strip() for info in infos)
    name.extend(tag.text
                for tag in soup.find_all("a", attrs={"data-el": "region"}))
    price.extend(tag.text
                 for tag in soup.find_all("div", attrs={"class": "totalPrice"}))

df = pd.DataFrame({"name": name, "huxing": huxing, "area": area,
                   "style": style, "price": price})
# BUG FIX: "utf_8_" is not a valid codec name and raises LookupError at
# runtime. "utf_8_sig" (UTF-8 with BOM) was almost certainly intended -- it
# makes Excel open the Chinese text correctly.
df.to_csv("ershoufang_3.csv", encoding="utf_8_sig", index=False)