TowardsDataScience 2023 博客中文翻译（三十九）

: · 测量

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Read the data:
# NOTE(review): the URL is a shortened link, so the file layout is not visible
# here; skiprows=[1] skips the second physical line of the CSV (presumably a
# units row under the header -- TODO confirm).
URL = 'https://bit.ly/3NLoDzx'
df = pd.read_csv(URL, skiprows=[1])

# Remove leading whitespace from the column names:
df.columns = df.columns.str.strip()

# Drop unnecessary columns:
# (selected by position 4 and 5; their names are not visible from this code)
df = df.drop(df.columns[[4, 5]], axis=1)

# Group by monthly MINIMUM ice extent:
# Passing a list of functions to agg() yields two-level column labels, so after
# reset_index() the columns are Year, Month and (Extent, min), in that order.
df = df.groupby(['Year', 'Month']).agg({'Extent': ['min']}).reset_index()

# Create a 'date' column from the 'Year' and 'Month' columns:
# Each row becomes a 'Year-Month' string, which is then parsed to a timestamp.
cols = ['Year', 'Month']
df['date'] = df[cols].apply(lambda x: '-'.join(x.values.astype(str)), 
                            axis="columns")
df['date'] = pd.to_datetime(df['date'])

# Set the 'date' column as the DataFrame index:
df = df.set_index(df['date'])

# Drop unnecessary year, month, and date columns:
# Positions 0, 1 and 3 are Year, Month and date; position 2 (Extent) is kept.
df = df.drop(df.columns[[0, 1, 3]], axis=1)

# Calculate the yearly moving average:
# The window is 12 rows, i.e. 12 monthly values when no month is missing.
df['yearly_ma'] = df.Extent.rolling(12).mean()

# Check the results:
df.tail(3)

# Create the plot:
# The monthly minimum extent and its 12-row moving average share one axes.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Arctic Sea Ice Monthly MINIMUM Extent', size=15)
ax.plot(df['Extent'], lw=2)
ax.plot(df['yearly_ma'], color='k')
ax.set_ylim([0, 20])
ax.tick_params(axis='both', 
               which='major', 
               labelsize=12)
ax.grid()

# Add a legend:
# Labels are given in plotting order: extent first, moving average second.
# loc=3 is matplotlib's numeric code for the lower-left corner.
ax.legend(['Ice Extent (10⁶ sq km)', 'Yearly Moving Ave'],
           frameon=True,
           loc=3,
           prop={'size': 14},
           facecolor='#a1c9f4',
           edgecolor='k',
           fancybox=True,
           shadow=True,
           framealpha=1)

# Add a shaded span for Gore's prediction:
# datestr2num converts the two date strings to matplotlib date numbers, which
# are unpacked as the left and right edges of the span.
ax.axvspan(*mdates.datestr2num(['2009-12-14', '2016-1-1']), 
           color='red', 
           alpha=0.3)

# Annotate the 7-year span referenced by Gore in 2009:
# transform=ax.transAxes means (0.655, 0.8) is a fraction of the axes area,
# not a data coordinate.
ax.text(0.655, 0.8, 
        "Gore's Next 7 Years", 
        transform=ax.transAxes, 
        fontsize=14)

# Set the x and y labels:
font1 = {'family': 'arial', 
         'color': 'black', 
         'size': 15}
ax.set_xlabel('Year', fontdict=font1)
ax.set_ylabel('Arctic Sea Ice Extent (10⁶ sq km)', 
              fontdict=font1)

plt.show()

// Baseline query: one row per continent/country with the [Sum Online Sales]
// measure, restricted by the YearIndex filter defined below.
DEFINE
  // Keep only the YearIndex values from -3 through 0 (both bounds inclusive).
  VAR __DS0FilterTable = 
    FILTER(
        KEEPFILTERS(VALUES('Date'[YearIndex]))
          ,AND('Date'[YearIndex] >= -3, 'Date'[YearIndex] <= 0)
          )

// Group by the two Geography columns, apply the filter table and add the measure.
VAR __DS0Core = 
  SUMMARIZECOLUMNS(
              'Geography'[ContinentName]
              ,'Geography'[RegionCountryName]
              ,__DS0FilterTable
              ,"Sum_Online_Sales", 'All Measures'[Sum Online Sales]
            )

EVALUATE
  __DS0Core

ORDER BY
  'Geography'[ContinentName]
  ,'Geography'[RegionCountryName]

// Same YearIndex filter (-3 through 0) as in the baseline query.
// NOTE(review): none of the three EVALUATE statements below references
// __DS0FilterTable, so as written the filter has no effect on them.
DEFINE
  VAR __DS0FilterTable =
    FILTER (
        KEEPFILTERS ( VALUES ( 'Date'[YearIndex] ) ),
            AND ( 'Date'[YearIndex] >= -3, 'Date'[YearIndex] <= 0 )
          )

// SELECTCOLUMNS: iterate the Geography table and return only the listed
// columns plus the measure.
EVALUATE
  SELECTCOLUMNS ('Geography'
                ,'Geography'[ContinentName]
                ,'Geography'[RegionCountryName]
                ,"Sum_Online_Sales", [Sum Online Sales]
              )

// ADDCOLUMNS: return every column of Geography and append the measure.
EVALUATE
  ADDCOLUMNS('Geography'
              ,"Sum_Online_Sales", [Sum Online Sales]
            )

// SUMMARIZE: group Geography by continent and country and add the measure.
EVALUATE
  SUMMARIZE( 'Geography'
            ,'Geography'[ContinentName]
            ,'Geography'[RegionCountryName]
            ,"Sum_Online_Sales", [Sum Online Sales]
          )

// YearIndex filter (-3 through 0) passed to SUMMARIZECOLUMNS as a filter table.
DEFINE
  VAR __DS0FilterTable =
    FILTER (
      KEEPFILTERS ( VALUES ( 'Date'[YearIndex] ) ),
          AND ( 'Date'[YearIndex] >= -3, 'Date'[YearIndex] <= 0 )
          )

// SUMMARIZECOLUMNS: group-by columns first, then the filter table, then the
// name/expression pair for the measure.
EVALUATE
  SUMMARIZECOLUMNS(
            'Geography'[ContinentName]
            ,'Geography'[RegionCountryName]
            ,__DS0FilterTable
            ,"Sum_Online_Sales", [Sum Online Sales]
          )
ORDER BY 'Geography'[ContinentName]
  ,'Geography'[RegionCountryName]

// CALCULATETABLE: evaluate the same SUMMARIZECOLUMNS under the filter
// 'Date'[Year] = 2023 instead of a filter-table argument.
EVALUATE
  CALCULATETABLE(SUMMARIZECOLUMNS(
                      'Geography'[ContinentName]
                      ,'Geography'[RegionCountryName]
                      ,"Sum_Online_Sales", [Sum Online Sales]
                  )
                  ,'Date'[Year] = 2023
              )
ORDER BY 'Geography'[ContinentName]
  ,'Geography'[RegionCountryName]

// YearIndex filter (-3 through 0), applied per row through CALCULATE below.
DEFINE
  VAR __DS0FilterTable =
    FILTER (
      KEEPFILTERS ( VALUES ( 'Date'[YearIndex] ) ),
        AND ( 'Date'[YearIndex] >= -3, 'Date'[YearIndex] <= 0 )
        )

// SUMMARIZE builds the continent/country grouping; ADDCOLUMNS then adds the
// measure, evaluated with the filter table wrapped in KEEPFILTERS.
EVALUATE
  ADDCOLUMNS(
      SUMMARIZE('Geography'
                ,'Geography'[ContinentName]
                ,'Geography'[RegionCountryName]
              )
              ,"Sum_Online_Sales", CALCULATE([Sum Online Sales]
              ,KEEPFILTERS(__DS0FilterTable)
            )
      )
ORDER BY 'Geography'[ContinentName]
  ,'Geography'[RegionCountryName]

// YearIndex filter (-3 through 0), applied once around the whole table expression.
DEFINE
VAR __DS0FilterTable =
  FILTER (
    KEEPFILTERS ( VALUES ( 'Date'[YearIndex] ) ),
      AND ( 'Date'[YearIndex] >= -3, 'Date'[YearIndex] <= 0 )
       )

// CALCULATETABLE sets the filter for everything inside it: SUMMARIZE creates
// the grouping and ADDCOLUMNS appends the measure.
EVALUATE
  CALCULATETABLE(
    ADDCOLUMNS(
      SUMMARIZE('Geography'
                ,'Geography'[ContinentName]
                ,'Geography'[RegionCountryName]
                )
          ,"Sum_Online_Sales", [Sum Online Sales]
          )
          ,__DS0FilterTable
       )

ORDER BY 'Geography'[ContinentName]
  ,'Geography'[RegionCountryName]

import urllib.request
# ESearch request: PubMed records matching the MeSH term 'myoglobin' with dates
# from 2022 to 2023, returned as JSON and capped at 50 IDs.
search_params = (
    'db=pubmed',
    'term=myoglobin[mesh]',
    'mindate=2022',
    'maxdate=2023',
    'retmode=json',
    'retmax=50',
)
search_url = (
    'http://eutils.ncbi.nlm.nih.gov/entrez//eutils/esearch.fcgi/'
    + '?'
    + '&'.join(search_params)
)

# Download the reply and keep it as text; the bare name echoes it in a notebook.
link_list = urllib.request.urlopen(search_url).read().decode('utf-8')
link_list

import json
# Parse the ESearch JSON reply and join the returned PubMed IDs into a single
# comma-separated string for the follow-up request.
result = json.loads( link_list )
id_list = ','.join( result['esearchresult']['idlist'] )

# ESummary request for all IDs found above, again asking for JSON.
summary_url = f'http://eutils.ncbi.nlm.nih.gov/entrez//eutils/esummary.fcgi?db=pubmed&id={id_list}&retmode=json'

summary_list = urllib.request.urlopen(summary_url).read().decode('utf-8')

summary = json.loads( summary_list )
# NOTE(review): hard-coded PubMed ID; this lookup raises KeyError whenever that
# ID is not among the IDs returned by the search above.
summary['result']['37047528']

# `re` is needed for the DOI rewrite below and is not imported earlier in the file.
import re

# summary['result'] maps each PubMed ID to its record and also carries a 'uids'
# entry; drop that entry once instead of filtering it in every comprehension.
records = {key: rec for key, rec in summary['result'].items() if key != 'uids'}

uid = list(records)
journals = [rec['fulljournalname'] for rec in records.values()]
titles = [rec['title'] for rec in records.values()]
first_authors = [rec['sortfirstauthor'] for rec in records.values()]
last_authors = [rec['lastauthor'] for rec in records.values()]
links = [rec['elocationid'] for rec in records.values()]
pubdates = [rec['pubdate'] for rec in records.values()]

# Turn 'doi: <id>' location strings into resolvable URLs. The pattern is a raw
# string so that '\s' reaches the regex engine instead of being treated as an
# (invalid) string escape.
links = [re.sub(r'doi:\s', 'http://dx.doi.org/', x) for x in links]

# One row per article.
results_df = pd.DataFrame({
    'ID': uid,
    'Journal': journals,
    'PublicationDate': pubdates,
    'Title': titles,
    'URL': links,
    'FirstAuthor': first_authors,
    'LastAuthor': last_authors,
})

'**uid**','**pubdate**','**epubdate**','**source**','**authors**','**lastauthor**','**title**',
'**sorttitle**','**volume**','**issue**','**pages**','**lang**','**nlmuniqueid**','**issn**',
'**essn**','**pubtype**','**recordstatus**','**pubstatus**','**articleids**','**history**',
'**references**','**attributes**','**pmcrefcount**','**fulljournalname**','**elocationid**',
'**doctype**','**srccontriblist**','**booktitle**','**medium**','**edition**',
'**publisherlocation**','**publishername**','**srcdate**','**reportnumber**',
'**availablefromurl**','**locationlabel**','**doccontriblist**','**docdate**',
'**bookname**','**chapter**','**sortpubdate**','**sortfirstauthor**','**vernaculartitle**'

from bs4 import BeautifulSoup
import lxml
import pandas as pd

# EFetch returns full PubMed records for the IDs in id_list; the reply is
# parsed with BeautifulSoup's XML parser.
abstract_url = f'http://eutils.ncbi.nlm.nih.gov/entrez//eutils/efetch.fcgi?db=pubmed&id={id_list}'
abstract_ = urllib.request.urlopen(abstract_url).read().decode('utf-8')
abstract_bs = BeautifulSoup(abstract_, features="xml")

articles_iterable = abstract_bs.find_all('PubmedArticle')


def _tag_text(article, tag_name):
    """Return the text of the first *tag_name* element in *article*, or ''."""
    tag = article.find(tag_name)
    return tag.text if tag is not None else ''


def _child_texts(article, parent_name, child_name):
    """Return the text of every *child_name* under *parent_name*, or []."""
    parent = article.find(parent_name)
    if parent is None:
        return []
    return [child.text for child in parent.find_all(child_name)]


# Every list below holds exactly one entry per article, so that the DataFrame
# columns line up. A record that lacks a section contributes '' or [] instead
# of raising AttributeError on a missing tag.

# Abstracts
abstract_texts = [_tag_text(x, 'AbstractText') for x in articles_iterable]

# Conflict of Interest statements
coi_texts = [_tag_text(x, 'CoiStatement') for x in articles_iterable]

# MeSH terms
meshheadings_all = [
    _child_texts(x, 'MeshHeadingList', 'MeshHeading') for x in articles_iterable
]

# ReferenceList (the original looped over the undefined name `articles_`)
references_all = [
    _child_texts(x, 'ReferenceList', 'Citation') for x in articles_iterable
]

results_table = pd.DataFrame( {'COI':coi_texts, 'Abstract':abstract_texts, 'MeSH_Terms':meshheadings_all, 'References':references_all} )

# Fetch the full records once more and list every XML element of the reply.
efetch_url = (
    'http://eutils.ncbi.nlm.nih.gov/entrez//eutils/efetch.fcgi'
    f'?db=pubmed&id={id_list}'
)
response = urllib.request.urlopen( efetch_url )
efetch_result = response.read().decode('utf-8')
efetch_bs = BeautifulSoup(efetch_result, features="xml")

# find_all() without arguments returns every tag in the document.
tags = efetch_bs.find_all()

for tag in tags:
    print(tag)

import urllib.request
import json

# ELink with cmd=neighbor_score: related PubMed records for one article,
# each with a similarity score.
id_ = '37055458'
elink_url = f'http://eutils.ncbi.nlm.nih.gov/entrez//eutils/elink.fcgi?db=pubmed&id={id_}&retmode=json&cmd=neighbor_score'
elinks = urllib.request.urlopen(elink_url).read().decode('utf-8')

elinks_json = json.loads( elinks )

# Each entry of `links` is a dict with an 'id' and a 'score'. Read both once
# per link; the original appended them once per dict key, which produced
# duplicates, and shadowed the builtin `id`.
all_links = elinks_json['linksets'][0]['linksetdbs'][0]['links']
ids_ = [link['id'] for link in all_links]
score_ = [link['score'] for link in all_links]

# drop_duplicates is kept as a safeguard in case the service repeats a link.
pd.DataFrame( {'id':ids_,'score':score_} ).drop_duplicates(['id','score'])

# ELink with cmd=prlinks: provider (full-text) URLs for a single article.
id_ = '37055458'
elink_url = (
    'http://eutils.ncbi.nlm.nih.gov/entrez//eutils/elink.fcgi'
    f'?db=pubmed&id={id_}&retmode=json&cmd=prlinks'
)
elinks = urllib.request.urlopen(elink_url).read().decode('utf-8')

elinks_json = json.loads( elinks )

# Only the first ID entry of the first link set is inspected.
provider_entries = elinks_json['linksets'][0]['idurllist'][0]['objurls']
[ entry['url']['value'] for entry in provider_entries ]

# Same prlinks request, now for several comma-separated IDs at once.
id_list = '37055458,574140'
elink_url = (
    'http://eutils.ncbi.nlm.nih.gov/entrez//eutils/elink.fcgi'
    f'?db=pubmed&id={id_list}&retmode=json&cmd=prlinks'
)
elinks = urllib.request.urlopen(elink_url).read().decode('utf-8')

elinks_json = json.loads( elinks )

elinks_json
# The reply holds one 'idurllist' entry per requested ID; print each provider
# URL next to the ID it belongs to.
urls_ = elinks_json['linksets'][0]['idurllist']
for url_ in urls_:
    for x in url_['objurls']:
        print( url_['id'], x['url']['value'] )

import pandas as pd
import time
from bs4 import BeautifulSoup
import seaborn as sns
from matplotlib import pyplot as plt
import itertools
from collections import Counter
from numpy import array_split
from urllib.request import urlopen

class Searcher:
    """Run one PubMed ESearch query per year of a range and collect the replies.

    Parameters
    ----------
    start_, end_ : int
        First year and end year; one request is issued for each year in
        ``range(start_, end_)``.
    term_ : str
        Search term, inserted into the request URL as given.
    **kwargs
        ``count=1``    add ``rettype=count`` to every request.
        ``retmax=N``   add ``retmax=N`` to every request.
        ``run=1``      search and parse immediately in the constructor.

    Attributes
    ----------
    search_results : list of str
        Raw reply text, one entry per request.
    count_by_year : list of str
        Text of the ``Count`` element of each reply.
    ids : list of str
        Text of all ``Id`` elements across the replies, without repeats.
    options : list of str
        Extra ``key=value`` query options built from the keyword arguments.
    """

    def __init__(self, start_, end_, term_, **kwargs):
        # NOTE(review): this stores the builtin ``input`` function; it is never
        # read in the visible code and is kept only so the attribute still exists.
        self.raw_ = input
        self.name_ = 'searcher'
        self.description_ = 'searcher'
        self.duration_ = end_ - start_
        self.start_ = start_
        self.end_ = end_
        self.term_ = term_
        self.search_results = list()
        self.count_by_year = list()
        # Initialised here so the attribute exists even before parse_results().
        self.ids = list()
        self.options = list()

        # Parse keyword arguments. The options are collected in a list so that
        # 'count' and 'retmax' can be combined instead of overwriting each other.
        if kwargs.get('count') == 1:
            self.options.append('rettype=count')

        if 'retmax' in kwargs:
            self.options.append(f'retmax={kwargs["retmax"]}')

        if kwargs.get('run') == 1:
            self.do_search()
            self.parse_results()

    def do_search(self):
        """Send one ESearch request per year and store each raw reply."""
        # Accept a plain string too, in case a caller assigned one to `options`.
        extra = [self.options] if isinstance(self.options, str) else self.options
        options = "&".join(extra)
        for year in range(self.start_, self.start_ + self.duration_):
            this_url = f'http://eutils.ncbi.nlm.nih.gov/entrez//eutils/esearch.fcgi/' + \
                       f'?db=pubmed&term={self.term_}' + \
                       f'&mindate={year}&maxdate={year + 1}&{options}'
            print(this_url)
            self.search_results.append(
                urlopen(this_url).read().decode('utf-8'))
            # Short pause between consecutive requests.
            time.sleep(.33)

    def parse_results(self):
        """Extract the hit count and the record IDs from every stored reply."""
        for result in self.search_results:
            xml_ = BeautifulSoup(result, features="xml")
            self.count_by_year.append(xml_.find('Count').text)
            # Accumulate across years; the original overwrote the list on each
            # iteration and so kept only the IDs of the last reply.
            self.ids.extend(tag.text for tag in xml_.find_all('Id'))
        # Each request covers `year` to `year + 1`, so consecutive requests can
        # return the same record; keep the first occurrence of every ID.
        self.ids = list(dict.fromkeys(self.ids))

    def __repr__(self):
        return repr(f'Search PubMed from {self.start_} to {self.end_} with search terms {self.term_}')

    def __str__(self):
        # The attribute is `description_`; the original read the nonexistent
        # `description` and raised AttributeError.
        return self.description_

# Create a list of searchers, one per publication year (2022 down to 2018), all
# running the same conflict-of-interest query.
SEARCH_TERM = 'CEO[cois]+OR+CTO[cois]+OR+CSO[cois]'
searchers = [
    Searcher(year, year + 1, SEARCH_TERM, run=1, retmax=10000)
    for year in range(2022, 2017, -1)
]

# Create a dictionary to store keywords for all articles from a particular year
keywords_dict = dict()

# IDs are fetched in batches to keep each request URL short
chunk_size = 200

# Each searcher obtained results for a particular start and end year
for this_search in searchers:

    # Split the results from one search into batches for URL formatting
    batches = array_split(this_search.ids, len(this_search.ids) // chunk_size + 1)

    # Create a dict key for this searcher object based on the years of coverage
    this_dict_key = f'{this_search.start_}to{this_search.end_}'

    # One list of keywords per article
    keywords_all = list()

    for this_batch in batches:
        # A search without hits yields a single empty batch; there is nothing
        # to request for it.
        if len(this_batch) == 0:
            continue

        ids_ = ','.join(this_batch)

        # Pull down the XML for all the results in a batch
        abstract_url = f'http://eutils.ncbi.nlm.nih.gov/entrez//eutils/efetch.fcgi?db=pubmed&id={ids_}'

        abstract_ = urlopen(abstract_url).read().decode('utf-8')
        abstract_bs = BeautifulSoup(abstract_, features="xml")
        articles_iterable = abstract_bs.find_all('PubmedArticle')

        # find_all() returns an empty result (never None) when an article has
        # no keywords, so each article contributes a possibly empty list.
        for article in articles_iterable:
            keywords_all.append([x.text for x in article.find_all('Keyword')])

        # Take a break between batches!
        time.sleep(1)

    # Once all the keywords are assembled for a searcher, add them to the dictionary
    keywords_dict[this_dict_key] = keywords_all

    # Print the key once its keywords have been stored
    print(this_dict_key)

# Keep keywords whose share of a year's articles is at least 0.005, and record
# each as a (keyword, share, year) tuple.

mapping_ = {'2018to2019':2018,'2019to2020':2019,'2020to2021':2020,'2021to2022':2021,'2022to2023':2022}
global_word_list = []

for span_label, article_keywords in keywords_dict.items():
    n_articles = len(article_keywords)

    # Count lower-cased keywords over all articles of this span.
    keyword_counts = Counter(
        keyword.lower()
        for keywords in article_keywords
        for keyword in keywords
    )

    for keyword, n_hits in keyword_counts.items():
        share = n_hits / n_articles
        if share >= .005:
            global_word_list.append((keyword, share, mapping_[span_label]))

# Plot results as clustermap

# One row per (word, frequency, year) tuple collected earlier.
global_word_df = pd.DataFrame(global_word_list, columns=['word', 'frequency', 'year'])

# Words as rows, years as columns; a word missing in a year gets frequency 0.
pivot_df = global_word_df.pivot(index="word", columns="year", values="frequency").fillna(0)

# Leave out the two pandemic keywords. errors='ignore' keeps this from raising
# KeyError when one of them is absent from the data.
pivot_df = pivot_df.drop(index=['covid-19', 'sars-cov-2'], errors='ignore')

sns.set(font_scale=0.7)
# NOTE(review): clustermap is a figure-level function and presumably draws on
# its own figure, so this figure is likely left empty -- confirm before removing.
plt.figure(figsize=(22, 2))
res = sns.clustermap(pivot_df, col_cluster=False, yticklabels=True, cbar=True)

# Load the tokenizer and the token-classification model of the same checkpoint
# and wrap both in a "ner" pipeline.
# NOTE(review): AutoTokenizer, AutoModelForTokenClassification and pipeline are
# not imported anywhere in the visible code; presumably they come from the
# Hugging Face `transformers` package -- confirm and add the import.
tokenizer = AutoTokenizer.from_pretrained("yanekyuk/bert-uncased-keyword-extractor")
model = AutoModelForTokenClassification.from_pretrained(
    "yanekyuk/bert-uncased-keyword-extractor"
)

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

def extract_keywords(text):
    """
    Extract keywords from *text* and rebuild them from the tagged tokens.

    The module-level ``nlp`` pipeline is called on the text. A token tagged
    'I-KEY' continues the current keyword: a word piece starting with '##' is
    glued on without its prefix, any other token is added after a space. A
    token with any other tag closes the current keyword and starts a new one.

    Returns the distinct keywords as a list in no particular order; the list
    is empty when the pipeline yields no tokens.
    """
    result = list()
    keyword = ""
    for token in nlp(text):
        word = token['word']
        if token['entity'] == 'I-KEY':
            keyword += word[2:] if word.startswith("##") else f" {word}"
        else:
            if keyword:
                result.append(keyword)
            keyword = word
    # Add the last keyword, unless nothing was collected at all (the original
    # appended an empty string in that case).
    if keyword:
        result.append(keyword)
    return list(set(result))

extract_keywords("""
Broadcom agreed to acquire cloud computing company VMware in a $61 billion (€57bn) cash-and stock deal.
""") # ['cloud computing', 'vmware', 'broadcom']

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def generate_embeddings(text):
    """Encode *text* with the module-level ``model`` and return a list of floats."""
    vector = model.encode(text).tolist()
    return list(map(float, vector))

# Ask APOC for graph statistics, keeping the `labels` and `relTypesCount` outputs.
gds.run_cypher("""
CALL apoc.meta.stats()
YIELD labels, relTypesCount
""")

# Count Page nodes per value of their `has_text` property.
gds.run_cypher("""
MATCH (p:Page)
RETURN p.has_text AS has_text,
       count(*) AS count
""")

# Pages without a `has_text` property, ranked by the number of incoming
# LINKS_TO or REDIRECTS relationships (top 5).
gds.run_cypher("""
MATCH (p:Page)
WHERE p.has_text IS NULL
RETURN p.url AS page,
       count{(p)<-[:LINKS_TO|REDIRECTS]-()} AS links
ORDER BY links DESC
LIMIT 5
""")

# Count LINKS_TO / REDIRECTS relationships that point at a page flagged is_404.
gds.run_cypher("""
MATCH (:Page)-[:LINKS_TO|REDIRECTS]->(:Page{is_404:true})
RETURN count(*) AS brokenLinkCount
""")

# Shortest directed path of at most 10 hops between the two given URLs,
# returned as the list of page URLs along the path.
gds.run_cypher("""
MATCH (start:Page {url:"https://neo4j.com/docs"}), 
      (end:Page {url:"https://console.neo4j.io"})
MATCH p=shortestPath((start)-[:LINKS_TO|REDIRECTS*..10]->(end))
RETURN [n in nodes(p) | n.url] AS path
""")

# Project a graph named 'structure' from the Page nodes and the LINKS_TO and
# REDIRECTS relationships.
G, metadata = gds.graph.project('structure', 'Page', 
  ['LINKS_TO', 'REDIRECTS'])

# Degree centrality with REVERSE orientation (presumably counts incoming
# relationships -- confirm against the GDS documentation), then attach each
# node's url and sort by score, highest first.
df = gds.degree.stream(G, orientation="REVERSE")
df["url"] = [d["url"] for d in gds.util.asNodes(df["nodeId"].to_list())]
df.sort_values("score", ascending=False, inplace=True)
df.head()

# PageRank on the same projection. Its 'score' column is renamed to 'pagerank'
# (pop + assign) so it does not collide with the degree score in the merge.
pr_df = gds.pageRank.stream(G)
pr_df["pagerank"] = pr_df.pop("score")
combined_df = df.merge(pr_df, on="nodeId")
combined_df.sort_values("pagerank", ascending=False, inplace=True)

# Five keywords with the most incoming HAS_KEYWORD relationships.
gds.run_cypher("""
MATCH (k:Keyword)
RETURN k.name AS keyword,
       count {(k)<-[:HAS_KEYWORD]-()} AS mentions
ORDER BY mentions DESC
LIMIT 5
""")

# Same ranking, restricted to pages whose URL contains "graph-data-science".
gds.run_cypher("""
MATCH (p:Page)-[:HAS_KEYWORD]->(k:Keyword)
WHERE p.url CONTAINS "graph-data-science"
RETURN k.name AS keyword,
       count(*) AS mentions
ORDER BY mentions DESC
LIMIT 5
""")

# Project Page and Keyword nodes with HAS_KEYWORD in REVERSE orientation into a
# graph named "keywords".
G, metadata = gds.graph.project(
    "keywords", ["Page", "Keyword"], {"HAS_KEYWORD": {"orientation": "REVERSE"}}
)

# Write node-similarity results back into the projection as CO_OCCUR
# relationships with a `score` property, using a similarity cutoff of 0.4.
gds.nodeSimilarity.mutate(
    G, mutateRelationshipType="CO_OCCUR", mutateProperty="score", 
    similarityCutoff=0.4
)

# Louvain community detection on Keyword nodes over CO_OCCUR relationships;
# then list, per community, its size and its keywords, largest first.
topic_df = gds.louvain.stream(G, nodeLabels=["Keyword"], relationshipTypes=["CO_OCCUR"])
topic_df["keyword"] = [
    n["name"] for n in gds.util.asNodes(topic_df["nodeId"].to_list())
]
topic_df.groupby("communityId").agg(
    {"keyword": ["size", list]}
).reset_index().sort_values([("keyword", "size")], ascending=False).head()

import numpy as np
from scipy.special import betaln as logbeta

def prob_b_beats_a(n_wins_a: int,
                   n_losses_a: int,
                   n_wins_b: int,
                   n_losses_b: int) -> float:
    """Return the probability that B's win rate exceeds A's.

    Each side's win rate is given a Beta(wins + 1, losses + 1) distribution.
    The probability is the finite sum

        sum_{i=0}^{alpha_b - 1}
            B(alpha_a + i, beta_a + beta_b)
            / ((beta_b + i) * B(1 + i, beta_b) * B(alpha_a, beta_a))

    where B is the beta function. The terms are evaluated in log space and
    exponentiated before summing.

    Raises:
        ValueError: if any of the four counts is negative.
    """
    if min(n_wins_a, n_losses_a, n_wins_b, n_losses_b) < 0:
        raise ValueError("win and loss counts must be non-negative")

    alpha_a = n_wins_a + 1
    beta_a = n_losses_a + 1

    alpha_b = n_wins_b + 1
    beta_b = n_losses_b + 1

    # All summation indices at once; betaln and log operate element-wise.
    i = np.arange(alpha_b)
    log_terms = (
        logbeta(alpha_a + i, beta_b + beta_a)
        - np.log(beta_b + i)
        - logbeta(1 + i, beta_b)
        - logbeta(alpha_a, beta_a)
    )
    # The original accumulated into the undefined name `total` and returned
    # the untouched `probability`; the sum itself is returned here.
    return float(np.exp(log_terms).sum())

pip install statsbombpy

import matplotlib
import matplotlib.pyplot as plt
from mplsoccer import VerticalPitch
import numpy as np
import pandas as pd
from statsbombpy import sb

# Fetch the competitions table once; the original called sb.competitions()
# three times to build a single filter.
competitions = sb.competitions()

# Row(s) for the 2015/2016 La Liga season.
competition_row = competitions[
    (competitions['competition_name'] == 'La Liga')
    & (competitions['season_name'] == '2015/2016')
]

# First distinct ID of each kind; raises IndexError when no row matched.
competition_id = pd.unique(competition_row['competition_id'])[0]
season_id = pd.unique(competition_row['season_id'])[0]

matches = sb.matches(competition_id=competition_id, season_id=season_id)

match_events = sb.events(match_id=match_id)

shots_against_team.head()

# Half-pitch layout with a single drawing area
pitch = VerticalPitch(line_zorder=2, line_color='black', half = True)
fig, axs = pitch.grid(nrows=1, ncols=1, axis=False, endnote_height=0.05)

# One red marker per shot: goals fully opaque, all other shots faded
for shot in shots_against_team.itertuples():
    pitch.scatter(
        shot.x,
        shot.y,
        alpha=1 if shot.shot_outcome == 'Goal' else 0.2,
        s=100,
        color="red",
        ax=axs['pitch'],
        edgecolors="black",
    )

# Half-pitch layout with two drawing areas; only the first one is filled here.
pitch = VerticalPitch(line_zorder=2, line_color='black', half = True)
fig, axs = pitch.grid(nrows=1, ncols=2, axis=False, endnote_height=0.05)

# Count shots per bin on a 6 x 5 grid.
shot_bin_statistic = pitch.bin_statistic(
    shots_against_team.x, 
    shots_against_team.y, 
    statistic='count', 
    bins=(6, 5), 
    normalize=False
)
#normalize by number of games
# (team_matches is defined outside the visible code)
shot_bin_statistic["statistic"] = shot_bin_statistic["statistic"]/len(team_matches)
#make a heatmap
pcm  = pitch.heatmap(shot_bin_statistic, cmap='Reds', edgecolor='grey', ax=axs['pitch'][0])
#legend to our plot
# The colour bar gets its own axes; its left edge sits at a negative figure
# coordinate (-0.05).
ax_cbar = fig.add_axes((-0.05, 0.093, 0.03, 0.786))
cbar = plt.colorbar(pcm, cax=ax_cbar)
axs['pitch'][0].set_title('Shots conceded heatmap')

# (team is defined outside the visible code)
fig.suptitle(f"Shots and Goals Against {team} in 2015/16 La Liga season", fontsize = 30)

# Count goals per heatmap bin
# (the original call was missing its closing parenthesis)
goals_against_team = shots_against_team.loc[
    shots_against_team['shot_outcome'] == 'Goal'
]
goal_bin_statistic = pitch.bin_statistic(
    goals_against_team.x,
    goals_against_team.y,
    statistic='count',
    bins=(6, 5),
    normalize=False
)

# Count shots per heatmap bin
shot_bin_statistic = pitch.bin_statistic(
    shots_against_team.x,
    shots_against_team.y,
    statistic='count',
    bins=(6, 5),
    normalize=False
)

# Create goal_shot_ratio KPI by dividing goals/shots
goal_shot_ratio = goal_bin_statistic.copy()
# Bins without shots divide by zero; the warnings are silenced because
# nan_to_num replaces the resulting NaN values with 0 right afterwards.
with np.errstate(divide='ignore', invalid='ignore'):
    ratio = np.divide(goal_bin_statistic['statistic'], shot_bin_statistic['statistic'])
goal_shot_ratio['statistic'] = np.nan_to_num(ratio)

def _first_position_start(positions):
    """Return the 'from' value of the first position entry, or NaN if there is none."""
    return positions[0]['from'] if positions else np.nan


def _last_position_end(positions):
    """Return the 'to' value of the last position entry.

    A missing ('to' is None) end is replaced by '90:00'; a player without any
    position entry gets NaN.
    """
    if not positions:
        return np.nan
    end = positions[-1]['to']
    return end if end is not None else '90:00'


# Build one lineup frame per match and concatenate them once at the end; the
# original re-concatenated the growing frame on every iteration.
lineup_frames = []
for match_id in pd.unique(all_events['match_id']):
    match_lineups = sb.lineups(match_id=match_id)['Barcelona']

    match_lineups['match_id'] = match_id
    # First distinct value of the 'minutes' column among this match's events.
    match_lineups['match_duration'] = all_events[all_events['match_id'] == match_id]['minutes'].unique()[0]
    match_lineups['from'] = match_lineups['positions'].apply(_first_position_start)
    match_lineups['to'] = match_lineups['positions'].apply(_last_position_end)
    # parse_positions is defined outside the visible code.
    match_lineups['minutes_played'] = match_lineups.apply(lambda x: parse_positions(x['positions'], x['match_duration']), axis=1)

    lineup_frames.append(match_lineups)

# join="inner" keeps only the columns present in every match frame.
all_lineups = pd.concat(lineup_frames, join="inner")

all_lineups = all_lineups.reset_index(drop=True)

 for match_id in pd.unique(all_lineups['match_id']):
    match_shots = shots_against_team[
      shots_against_team['match_id'] == match_id
    ]
    for player_tup in all_lineups[all_lineups['match_id'] == match_id].itertuples():

        # For whatever reason, the 'from' column is being mapped to '_10'
        shots_conceded = match_shots[
          (match_shots['time'] >= player_tup._10) 
          & (match_shots['time'] <= player_tup.to)
        ]
        goals_conceded = len(
          shots_conceded[shots_conceded['shot_outcome'] == 'Goal']
        )
        shots_conceded = len(shots_conceded)

        all_lineups.at[player_tup.Index,'shots_conceded'] = shots_conceded
        all_lineups.at[player_tup.Index,'goals_conceded'] = goals_conceded

# Season totals per player, then shots and goals conceded per minute played
per_player_columns = ['minutes_played', 'shots_conceded', 'goals_conceded']
grouped = all_lineups.groupby('player_id')[per_player_columns].sum()

minutes = grouped['minutes_played']
grouped['shots_per_minute'] = grouped['shots_conceded'] / minutes
grouped['goals_per_minute'] = grouped['goals_conceded'] / minutes

Barcelona conceded 356 shots. An average of 9.37 shots per match.
Barcelona conceded 26 goals. An average of 0.68 goals per match.

**Thanks for reading the post!** 

I really hope you enjoyed it and found it insightful.

Follow me and subscribe to my mailing list for more 
content like this one; it helps a lot!

**@polmarin**

import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import pysal
import splot
import re
import seaborn as sns
import folium

# For points map
import geoplot.crs as gcrs
import geoplot as gplt

# Load only the columns needed from the listings file
listing_columns = ['id', 'property_type', 'neighbourhood_cleansed',
                   'bedrooms', 'beds', 'bathrooms_text', 'price',
                   'latitude','longitude']
listings = pd.read_csv('/content/listings.csv', usecols=listing_columns)
#listings.columns
listings.sample(4)

# Price arrives as text: strip every '$' and ',' and convert to float
price_text = listings['price'].replace("[$,]", "", regex=True)
listings['price'] = price_text.astype(float)

#price stats
listings.price.describe()

count    3239.000000
mean      179.771843
std       156.068212
min        14.000000
25%        95.000000
50%       135.000000
75%       212.500000
max      2059.000000
Name: price, dtype: float64

# Distribution of prices with a density curve
sns.displot(listings['price'], kde=True);

# GeoDataFrame with one point per listing (longitude = x, latitude = y, EPSG 4326)
listing_points = gpd.points_from_xy(listings.longitude, listings.latitude, crs=4326)
listings_gpd = gpd.GeoDataFrame(listings, geometry=listing_points)

# Inspect the new geometry column
listings_gpd.head(2)

# Points over a web map, coloured by price, for listings priced below 538
below_538 = listings_gpd.query('price < 538')
ax = gplt.webmap(below_538, projection=gcrs.WebMercator())
gplt.pointplot(below_538, ax=ax, hue= 'price', legend=True);

# Neighbourhood polygons (geojson), reprojected to EPSG 4326
geofile = '/content/neighbourhoods.geojson'
asheville = gpd.read_file(geofile).to_crs(4326)

# Density heatmap of all listings, clipped to the polygons
ax = gplt.kdeplot(
    listings_gpd,
    fill=True,
    cmap='Reds',
    clip=asheville.geometry,
    projection=gcrs.WebMercator(),
)

# Polygon outlines drawn on the same axes
gplt.polyplot(asheville, ax=ax, zorder=1);

# `numpy.Inf` was removed in NumPy 2.0; the lowercase `inf` exists in every version.
from numpy import inf

# Create clip levels for prices
# Each bin includes its upper edge (pd.cut default), so a price of exactly 100
# falls in '0-100'.
price_edges = [-inf, 100, 200, 300, 400, 500, inf]
price_labels = ['0-100', '100-200', '200-300', '300-400', '400-500', '500+']
listings['price_bins'] = pd.cut(listings.price, bins=price_edges, labels=price_labels)

# Create bin colors (one colour per price bin, same order as the labels)
bin_colors = {'0-100': 'lightblue', '100-200':'blue', '200-300':'gold', '300-400':'orange', '400-500':'red', '500+':'black'}
listings['colors'] = listings.price_bins.map(bin_colors)

# Base map centred on the given coordinates
m = folium.Map(location= [35.5951, -82.5515], zoom_start=10)

# One small circle per listing, drawn in the colour of its price bin
marker_style = dict(radius=2, opacity=0.5, fill=True)
for lat, lon, ptcolor in zip(listings.latitude, listings.longitude, listings.colors):
    marker = folium.CircleMarker(
        location=[lat, lon],
        color=ptcolor,
        fill_color=ptcolor,
        **marker_style,
    )
    marker.add_to(m)

from folium import plugins

# HeatMap input: one [latitude, longitude, price] list per listing
data = listings[['latitude','longitude', 'price']].values.tolist()

# Colour stops of the heat layer
heat_gradient = {0.1: 'blue', 0.2: 'lime', 0.4: 'yellow', 0.6: 'orange', 0.9: 'red'}

# Create Heat Map with Folium
hm = plugins.HeatMap(
    data,
    gradient=heat_gradient,
    min_opacity=0.1,
    max_opacity=0.9,
    radius=20,
    use_local_extrema=False,
)

# Add to base map
hm.add_to(m);

# Display
m

# Add a choropleth layer
# A choropleth needs one value per polygon, so the listings are first reduced
# to the mean price per neighbourhood. The listings file was loaded with the
# column 'neighbourhood_cleansed'; there is no 'neighbourhood' column in it.
# NOTE(review): assumes 'neighbourhood_cleansed' holds the same names as the
# geojson property 'neighbourhood' -- confirm against the data.
neighbourhood_prices = (
    listings.groupby("neighbourhood_cleansed", as_index=False)["price"].mean()
)

folium.Choropleth(
    geo_data=asheville,
    name="choropleth",
    data=neighbourhood_prices,
    columns=["neighbourhood_cleansed", "price"],
    key_on="feature.properties.neighbourhood",
    fill_color="RdBu_r",
    fill_opacity=0.5,
    line_opacity=0.5,
    legend_name="Prices",
).add_to(m)
m

# Base map with Terrain mode
# NOTE(review): recent folium releases may no longer ship the "Stamen Terrain"
# tiles as a built-in name -- confirm for the installed version.
m = folium.Map(location= [35.5951, -82.5515], zoom_start=10, tiles="Stamen Terrain")

# Add a choropleth layer to a terrain map
# A choropleth needs one value per polygon, so the listings are first reduced
# to the mean price per neighbourhood. The listings file was loaded with the
# column 'neighbourhood_cleansed'; there is no 'neighbourhood' column in it.
# NOTE(review): assumes 'neighbourhood_cleansed' holds the same names as the
# geojson property 'neighbourhood' -- confirm against the data.
terrain_neighbourhood_prices = (
    listings.groupby("neighbourhood_cleansed", as_index=False)["price"].mean()
)

folium.Choropleth(
    geo_data=asheville,
    name="choropleth",
    data=terrain_neighbourhood_prices,
    columns=["neighbourhood_cleansed", "price"],
    key_on="feature.properties.neighbourhood",
    fill_color="RdBu_r",
    fill_opacity=0.5,
    line_opacity=0.5,
    legend_name="Prices",
).add_to(m)
m

TowardsDataScience 2023 博客中文翻译（三十九）

分析工程学介绍

分析工程师是谁，他们应该做什么

从 ETL 到 ELT 的过渡

提取

转换

加载

ETL: 提取 → 转换 → 加载

ELT: 提取 → 加载 → 转换

数据分析工程

dbt：分析工程的终极工具

最后的想法…

使用 Python 分析北极冰趋势

探索过去的预测

关于气候变化的评论

国家雪冰数据中心

代码

导入库

加载和准备数据

绘制数据

结果

引用

谢谢！

分析在 Power BI 和 DAX 查询中聚合数据的性能

我们在 Power BI 中经常聚合数据。有时我们需要手动查询数据模型，或者在度量中需要中间表。让我们看看如何做到这一点。

介绍

基础查询

SELECTCOLUMNS() 和 ADDCOLUMNS()

SUMMARIZE()

SUMMARIZECOLUMNS()

CALCULATETABLE()

组合函数

结论

参考文献

有时我们会遇到运行缓慢的报告，需要找出原因。我们将学习如何收集性能数据以及……

阅读 Salvatore Cagliari 的每一个故事（以及 Medium 上成千上万其他作者的故事）。你的会员费用直接…

用 E-utilities 和 Python 分析科学出版物

如何收集科学文献数据并发现趋势

NCBI 可以回答哪些数据科学问题？

查询 NCBI 数据库

四个最有用的 E-utilities

搜索字段

查询参数

使用 Python 执行查询并存储结果

使用 ESummary 来返回关于出版物的信息。

当你需要摘要、关键词和其他细节（仅限 XML 输出）时，请使用 EFetch。

使用 ELink 检索相似出版物和全文链接

示例数据可视化：来自 C-suite 作者的科学出版物

结论

参考文献

使用自然语言处理和知识图谱分析您的网站

结合各种自然语言处理技术，构建一个表示您网站的知识图谱

数据收集和建模工作流程

网络分析

总结

分析加州电动汽车的采纳率

使用 DMV 数据与 Pandas 和 GeoPandas

数据

过程

技术方法（GeoPandas 教程）

重命名 zip_code 几何列

创建副本县几何列

计算邮政编码质心

将邮政编码的活动几何设置为质心列

电动车采纳率 2022

结果

分析 Chess960 数据

使用超过 1400 万局 Chess960 游戏来找出是否有比其他变体更好的变体

介绍

简而言之

数据

数学框架

贝叶斯 A/B 测试

族系错误率（Family-wise Error Rate）

训练/测试集划分

数据准备

结果

生成假设

过去与未来的表现

评估与率