本次爬虫只是为了简单回顾一下 requests 和 XPath 的用法，便于以后查阅。
import requests
from lxml import etree
import time
# Crawl the Douban Music Top 250 chart and collect one dict per album in `lit`.
# The chart is paginated through the `start` query parameter: 0, 25, ..., 225.
urls = ["https://music.douban.com/top250?start={}".format(i) for i in range(0, 250, 25)]
headers = {'user-agent': 'Mozilla/5.0'}
lit = []
for url in urls:
    # Without a timeout a single stalled connection would block the crawl forever.
    res = requests.get(url=url, headers=headers, timeout=10)
    # Stop on an HTTP error status instead of parsing an error page into zero rows.
    res.raise_for_status()
    selector = etree.HTML(res.text)
    info_list = selector.xpath('//div[@class="indent"]/table')
    for i in info_list:
        names = i.xpath('.//div/a/text()')
        infos = i.xpath('.//div/p/text()')
        # Split the slash-separated info line once rather than once per field.
        fields = infos[0].split('/') if infos else []
        if not names or len(fields) < 2:
            # A table without a title link or with a too-short info line cannot
            # fill every column; skip it instead of aborting with IndexError.
            continue
        # Values are stored exactly as extracted (surrounding whitespace kept);
        # any cleanup is left to later processing of the collected rows.
        lit.append({
            'name': names[0],
            'songer': fields[0],
            'date': fields[1],
            'song_type': fields[-1],
        })
    # Pause after each page before requesting the next one.
    # NOTE(review): the original indentation was lost; the sleep is assumed to
    # run once per page rather than once per album -- confirm.
    time.sleep(3)
import pandas as pd
# Turn the list of per-album dicts into a table, one row per album.
df = pd.DataFrame(data=lit)
# Preview the first five rows.
df.head(n=5)
|   | name | songer | date | song_type |
|---|---|---|---|---|
| 0 | \n We Sing. We Dance. We Steal Thin... | Jason Mraz | 2008-05-13 | 民谣 |
| 1 | \n Viva La Vida\n | Coldplay | 2008-06-17 | 摇滚 |
| 2 | \n 华丽的冒险\n | 陈绮贞 | 2005-09-23 | 流行 |
| 3 | \n 范特西\n | 周杰伦 | 2001-09-14 | 流行 |
| 4 | \n 後。青春期的詩\n | 五月天 | 2008-10-23 | 摇滚 |
# Remove the leading/trailing whitespace and newlines from the album titles.
# The vectorised `.str` accessor leaves missing values as NaN, whereas
# `apply(lambda x: x.strip())` would raise AttributeError on them.
df['name'] = df['name'].str.strip()
df.head()
|   | name | songer | date | song_type |
|---|---|---|---|---|
| 0 | We Sing. We Dance. We Steal Things. | Jason Mraz | 2008-05-13 | 民谣 |
| 1 | Viva La Vida | Coldplay | 2008-06-17 | 摇滚 |
| 2 | 华丽的冒险 | 陈绮贞 | 2005-09-23 | 流行 |
| 3 | 范特西 | 周杰伦 | 2001-09-14 | 流行 |
| 4 | 後。青春期的詩 | 五月天 | 2008-10-23 | 摇滚 |