龙珠直播信息前50条爬取
import urllib.request as ur
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime
def open_url(url):
    """Fetch *url* and return the raw response body as bytes.

    A browser User-Agent header is sent because the site rejects the
    default urllib agent.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = ur.Request(url=url, headers=headers)  # python2: urllib2.Request
    # BUG FIX: the response was never closed; the context manager closes
    # the connection deterministically instead of leaking it.
    with ur.urlopen(req) as response:
        return response.read()
def set_all(url):
    """Return every <a> anchor inside the second "list-con" div of the page."""
    html = open_url(url).decode('utf-8')
    page = BeautifulSoup(html, 'html.parser')
    containers = page.findAll("div", attrs={"class": "list-con"})
    # The second list-con block is the one holding the live-room cards.
    return containers[1].findAll("a")
def sigle_jx(x):
    """Parse one <a class="livecard"> element into a dict of room info.

    The returned dict contains the attribute key/value pairs scraped from
    the anchor tag plus: see_num (viewer count), game (game title),
    zhubo (streamer name), desc (badge text or a default), and imgs
    (lists of image src URLs).
    """
    # Capture attr="value" pairs from the livecard anchor; the middle
    # groups split the composite "text:...,key:value,..." attribute into
    # sub-pairs. Fragile: depends on the exact attribute order in the HTML.
    partern = '''.*<a class="livecard" (.*?)="(.*?)" (.*?)="(.*?)" (.*?)="text:(.*?),(.*?):(.*?),(.*?):(.*?),(.*?):(.*?)" (.*?)="(.*?)" (.*?)="(.*?)" target="_blank">.*'''
    t1 = [y for y in re.findall(partern, repr(x))[0]]
    # the head of link-info
    res = {}
    # Fold the flat capture list into key/value pairs; the last pair is
    # skipped (len/2 - 1 iterations) -- NOTE(review): confirm dropping the
    # final captured pair is intentional.
    for i in range(int(len(t1) / 2 - 1)):
        res.setdefault(t1[2 * i], t1[2 * i + 1])
    # NOTE(review): the comprehension variable shadows the parameter `x`,
    # but only inside the comprehension scope, so later uses are safe.
    imgs = [re.findall('.*src="(.*?)".*', repr(x)) for x in x.findAll("img")]
    # The two meta-item-text spans are viewer count and game title, in order.
    see_num, game = tuple([x.next_element for x in tuple(x.findAll("span", attrs={"class": "livecard-meta-item-text"}))])
    username = x.find("strong", attrs={"class": "livecard-modal-username"}).next_element
    try:
        badge = x.find("span", attrs={"class": "livecard-badge"}).next_element
    except AttributeError:
        # Room has no badge element (find() returned None); use a default.
        badge = "无描述"
    res.setdefault("see_num", see_num)
    res.setdefault("game", game)
    res.setdefault("zhubo", username)
    res.setdefault("desc", badge)
    res.setdefault("imgs", imgs)
    return res
if __name__ == '__main__':
    now = datetime.today()
    url = '''http://longzhu.com/channels/all'''
    # Parse every live-room card on the "all channels" page.
    result = [sigle_jx(x) for x in set_all(url)]
    df = pd.DataFrame(result)
    # Zero-padded timestamp (MMDDHHMMSS) avoids ambiguous/colliding names
    # such as month=1,day=12 vs month=11,day=2 under plain str() joining.
    file_name = now.strftime("%m%d%H%M%S")
    # BUG FIX: the original literal was ".csv" preceded by a stray '+'
    # inside the string, producing files named "csv_XXXX+.csv".
    df.to_csv("csv_" + file_name + ".csv")
    num = 0
    for img_lists in df['imgs']:
        for img in img_lists:
            num += 1
            # Each `img` is a list of captured src URLs; download the first.
            with open(file_name + str(num) + ".png", "wb") as o:
                o.write(open_url(img[0]))
直接解析 URL 即可，无任何难点。通过 crontab 定时执行脚本：
crontab -e
*/30 * * * * /usr/bin/python3 ~/longzhu/spyder.py
天池排名爬取(无优化)
很久前写的, 直接贴上来
import urllib.request as ur
from bs4 import BeautifulSoup
import re
# Ranking-list endpoint; a 1-based page index is appended per request.
url = "https://tianchi.aliyun.com/competition/rankingList.htm?season=0&raceId=231602&pageIndex="
class SpyderTest():
    """Scrape the first five pages of a Tianchi competition ranking list."""

    def open_url(self, url):
        """Fetch *url* with a browser User-Agent and return decoded HTML."""
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = ur.Request(url=url, headers=headers)  # python2: urllib2.Request
        # BUG FIX: close the connection instead of leaking it.
        with ur.urlopen(req) as response:
            return response.read().decode('utf-8')

    def set_all(self, url):
        """Return every ranking row (<li class="list-item">) on one page."""
        data = self.open_url(url)
        soup1 = BeautifulSoup(data, 'html.parser')
        soup = soup1.findAll("li", attrs={"class": "list-item"})
        return list(soup)

    def main(self):
        """Collect the rows of ranking pages 1 through 5."""
        urls = [url + str(i + 1) for i in range(5)]
        return [j for i_url in urls for j in self.set_all(i_url)]

    def test(self, xx):
        """Parse a single ranking row into a dict of team information."""
        # element for class--rank
        rank = xx.find("div", attrs={"class": "ranking"}).find("p").next_element
        rank_ud = xx.find("div", attrs={"class": "ranking"}).find("span")
        try:
            up_down = re.findall('''.*<span class="(.*?)"><i></i><sup>(.*?)</sup></span>.*''', repr(rank_ud))[0]
            # "down" badge means the team dropped; store the delta signed.
            if up_down[0] == "down":
                rank_fo = -1 * int(up_down[1])
            else:
                rank_fo = int(up_down[1])
        except (IndexError, ValueError):
            # BUG FIX: was a bare `except:`. Only a missing/unmatched
            # movement badge (findall -> [] -> IndexError) or a non-numeric
            # delta (ValueError) should fall back to "no movement".
            rank_fo = 0
        # element for class-member
        member = xx.find("div", attrs={"class": "member-box"})
        team_name = member.find("p").next_element
        menber_href = re.findall('''.*<a href="(.*?)" target="_blank">(.*?)</a>.*''', repr(member))
        groups = xx.find("div", attrs={"class": "team-box"}).findAll("p")[1]
        group = groups.next_element
        # element for class style&score&dt
        score = xx.find("div", attrs={"style": "width:105px"}).next_element
        best_time = xx.find("div", attrs={"class": "best-time"}).next_element
        ## END
        info = {}
        info.setdefault("队伍名称", team_name)
        info.setdefault("队伍名次", int(rank.split("\n")[0]))
        info.setdefault("波动名次", rank_fo)
        info.setdefault("所在组织", group)
        info.setdefault("成员信息", menber_href)
        info.setdefault("最近提交", best_time.split("\n")[1].replace(" ", ""))
        info.setdefault("分数", score.split("\n")[1].replace(" ", ""))
        return info

    def run(self):
        """Scrape all pages and dump the ranking table to CSV."""
        gaim = [self.test(xx) for xx in self.main()]
        import pandas as pd
        df = pd.DataFrame(gaim, columns=["队伍名称", "队伍名次", "成员信息", "分数", "波动名次",
                                         "最近提交", "所在组织"])
        df.to_csv("c://aaa.csv")
# Guard the entry point so importing this module does not trigger a scrape.
if __name__ == '__main__':
    SpyderTest().run()