HTML tags in the data are not recognized: Beautiful Soup stripping HTML caption tags, th class tags, and retrieving data not in a list...

So I've created a web scraper that goes to cfbstats.com/2014/player/index.html and retrieves all the college football teams and their links. From there it goes into each team's link and pulls the roster along with the player links. Finally, it goes into each player's link and takes his stats.

I am currently having a problem with the player stats. When I call the caption of each table, I get the printed output [Tackle], and when I call the first row of the table, I get [G]. I would like to get rid of these tags, but I haven't been able to strip them out with the functions I've tried. Any help would be appreciated.

import csv
import sys
import json
import urllib
import requests
from bs4 import BeautifulSoup
import xlrd
import xlwt


def getCollegeandURL():

    f = open('colleges.csv', 'w')
    f.write("Teams" + "," + "," + "URL" + '\n')
    originalurl = "http://www.cfbstats.com/2014/player/index.html"
    base = requests.get("http://www.cfbstats.com/2014/player/index.html")
    base = base.text
    soup = BeautifulSoup(base)

    # this is to find all the colleges in the div conference
    mydivs = soup.find_all('div', {'class': 'conference'})

    # g is an excel document for the roster
    g = open('rosters.csv', 'w')
    g.write("College Rosters" + '\n' + '\n' + 'College' + ',' + 'Playernumber' + ',' + 'Player Last Name' + ',' + 'Player First Name' + ',' + 'Position' + ',' + 'Year' + ',' + 'Height' + ',' + ' Weight' + ',' + 'Hometown' + ',' + 'State' + ',' + 'Last School' + ',' + '\n')

    # h is an excel for each player stats
    h = xlwt.Workbook()

    # this for loop writes each college to a line
    for div in mydivs:
        urls = div.findAll('a')
        # this is to pull all the college names and each of their links
        for url in urls:
            college = url.text
            url = url.attrs['href']
            teamurl = originalurl[:23] + url
            f.write(college[:] + ',' + ',' + teamurl[:] + '\n')
            scrapeRosters(college, teamurl, g, h)

############################################################################

def scrapeRosters(college, teamurl, g, h):

    # this gets the pages of teams
    roster = requests.get(teamurl)
    roster = roster.text
    roster = BeautifulSoup(roster)
    teamname = roster.find_all('h1', {'id': 'pageTitle'})
    teamAndPlayers = {}
    table = roster.find_all('table', {'class': 'team-roster'})

    for i in table:
        rows = i.find_all('tr')
        for row in rows[1:]:
            data = [str(i.getText()) for i in row('td')]
            link = row('td')[1]('a')
            if len(link) > 0:
                link = str(link[0]['href'])
                data = [str(link)] + data

                # unpacking data into variables
                (playerurl, playernumber, playerName, playerPosition,
                 YearinCollege, playerHeight, playerWeight,
                 playerHometown, lastSchool) = data

                # creating the full player url
                playerurl = teamurl[:23] + playerurl

                # repacking the data
                data = (college, playernumber, playerName, playerPosition,
                        YearinCollege, playerHeight, playerWeight,
                        playerHometown, lastSchool)

                g.write(college + ',' + playernumber + ',' + playerName + ',' + playerPosition + ',' + YearinCollege + ',' + playerHeight + ',' + playerWeight + ',' + playerHometown + ',' + lastSchool + ',' + ',' + playerurl + ',' + '\n')

                playerStats(data, playerurl, h)

############################################################################

def playerStats(data, playerurl, h):

    playerurl = requests.get(playerurl)
    playerurl = playerurl.text
    playerurl = BeautifulSoup(playerurl)
    tablestats = playerurl.find_all('table', {'class': 'player-home'})

    (college, playernumber, playerName, playerPosition, YearinCollege,
     playerHeight, playerWeight, playerHometown, lastSchool) = data

    # print college, playernumber, playerName
    print college, playerName, playernumber

    for x in tablestats:
        caption = x.find_all('caption')
        rows = x.find_all('tr')
        ## caption = caption.strip
        for row in rows:
            headers = x.find_all('th')
            headers = [str(i.getText()) for i in row('tr')]
            stats = [str(x.getText()) for x in row('td')]
            print caption, headers, stats

############################################################################

def main():
    getCollegeandURL()

main()
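
For reference, here is a minimal sketch of the kind of output I'm after, using find() and get_text() on a made-up table shaped like the cfbstats stats tables (the caption text and cell values below are stand-ins, not real data):

    from bs4 import BeautifulSoup

    # A made-up stats table shaped like the ones on the player pages.
    html = """
    <table class="player-home">
      <caption>Tackle</caption>
      <tr><th>G</th><th>Solo</th><th>Total</th></tr>
      <tr><td>12</td><td>40</td><td>55</td></tr>
    </table>
    """

    soup = BeautifulSoup(html)

    for table in soup.find_all('table', {'class': 'player-home'}):
        # find() returns a single tag (find_all() returns a list of tags),
        # and get_text() drops the surrounding <caption> markup
        caption = table.find('caption').get_text()

        rows = table.find_all('tr')
        # header cells are the <th> tags in the first row
        headers = [th.get_text() for th in rows[0].find_all('th')]
        # the remaining rows hold the numbers in <td> cells
        stats = [[td.get_text() for td in row.find_all('td')] for row in rows[1:]]

        print caption    # Tackle
        print headers    # ['G', 'Solo', 'Total']
        print stats      # [['12', '40', '55']]

Printing the result of find_all() directly is what produces the bracketed, tag-wrapped output like [Tackle]: it shows the whole list of tags rather than the text inside them.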
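
I also realized I import csv but never use it; building the roster lines by joining strings with commas will break any row where a field itself contains a comma (hometowns like "Miami, FL"). Here is the csv.writer version I'm considering for the roster file (the row values are invented for illustration):

    import csv

    # invented sample row, in the same order scrapeRosters unpacks
    row = ['Alabama', '9', 'Cooper, Amari', 'WR', 'JR',
           '6-1', '210', 'Miami, FL', 'Northwestern HS']

    g = open('rosters.csv', 'wb')   # 'wb' so the csv module controls line endings on Python 2
    writer = csv.writer(g)
    writer.writerow(['College', 'Number', 'Name', 'Position', 'Year',
                     'Height', 'Weight', 'Hometown', 'Last School'])
    writer.writerow(row)            # fields with embedded commas get quoted automatically
    g.close()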
