USE_CUDA = False
VERBOSE = False

if USE_CUDA:
    import cudf

    %load_ext cudf.pandas
import pandas as pd
import importlib
import workbench.src.data_loader as data_loader
import workbench.src.graph as graph
import workbench.src.data_process as data_process
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import src.feature_select as feature_select
from IPython.core.display import Markdown
from workbench.utils import utils

# Pandas display configuration.
# The original cell set several options more than once; only the last value
# of each option took effect, so just those effective values are kept here.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 2000)
# None disables truncation of cell contents.  The previous value (-1) has been
# deprecated since pandas 1.0 and is rejected by newer releases.
pd.set_option("display.max_colwidth", None)

/tmp/ipykernel_17118/1998127023.py:7: FutureWarning: Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.
  pd.set_option("display.max_colwidth", -1)

# Hand the pandas / pyplot modules to the project helpers
# (presumably they apply shared display and plotting defaults -- TODO confirm).
utils.pandas_config(pd)
utils.plt_config(plt)

# Seaborn theme first, then the matplotlib style sheet on top of it.
sns.set_theme(style="darkgrid", palette="pastel")
plt.style.use("fivethirtyeight")

# Reload the helper module so source edits are picked up without restarting
# the kernel, then load the raw data set.
importlib.reload(data_loader)
data = data_loader.load_data(inc_players=True)

importlib.reload(data_process)
rolling_stats: data_process.RollingMatchTeamStats = (
    data_process.append_rolling_match_team_stats(data)
)
dual_df = rolling_stats.dual_df
# NOTE(review): data_process was already reloaded a few lines above; this
# second reload looks redundant.
importlib.reload(data_process)
# full_df is the frame used by most of the analysis below.
full_df = data_process.add_team_strategies(
    source_df=dual_df, team_attrs_df=data.team_attrs_df
)

# Per-match goal totals and the per-league / per-season aggregates used by
# the overview figure.
#
# A bare expression inside an ``if`` body is never rendered by IPython (only
# the last top-level expression of a cell is), so the VERBOSE branches call
# display() explicitly.
if VERBOSE:
    display(data.matches_df)

data.matches_df["total_goals"] = (
    data.matches_df["home_team_goal"] + data.matches_df["away_team_goal"]
)

# Matches per league (rows) and season (columns); missing combinations are 0.
league_matches_by_year = (
    data.matches_df.groupby(["league_name", "season_start_year"])
    .size()
    .unstack(fill_value=0)
)

# NOTE: despite the "median_" prefix both frames hold *mean* goals per match.
# The names are kept because later cells refer to them.
median_goals_per_league = (
    data.matches_df.groupby(["league_name", "season_start_year"])["total_goals"]
    .mean()
    .reset_index()
)
median_goals_all_leagues = (
    data.matches_df.groupby("season_start_year")["total_goals"].mean().reset_index()
)

if VERBOSE:
    display(league_matches_by_year.dtypes)

# Three-panel overview:
#   ax1 - distribution of goals per match for each league (boxen plot)
#   ax2 - number of matches per season, stacked by league
#   ax3 - mean goals per match per season, per league and overall
fig = plt.figure(figsize=(21, 18))
ax1 = plt.subplot2grid((3, 2), (0, 0), rowspan=2, fig=fig)
ax2 = plt.subplot2grid((3, 2), (0, 1), fig=fig)
ax3 = plt.subplot2grid((3, 2), (1, 1), fig=fig)
palette = plt.cm.tab10.colors
unique_leagues = data.matches_df["league_name"].unique()
league_color_mapping = {
    league: palette[i % len(palette)] for i, league in enumerate(unique_leagues)
}

# Pin the category order so the text annotations below are drawn on the row
# of the league they describe.  The groupby results used for the annotations
# are sorted by league name, which need not match the plotting order.
league_order = [league for league in unique_leagues if pd.notna(league)]

# seaborn deprecates `palette` without `hue`; assigning the categorical
# variable to `hue` with legend=False gives the same picture.
sns.boxenplot(
    ax=ax1,
    data=data.matches_df,
    y="league_name",
    x="total_goals",
    hue="league_name",
    order=league_order,
    linewidth=0.75,
    palette=league_color_mapping,
    legend=False,
)

ax1.set_title("Distribution of Goals per Match by League")
# tick_params rotates the labels without freezing the tick locations, which
# set_xticklabels(get_xticklabels()) warns about.
ax1.tick_params(axis="x", labelrotation=45)

# Annotate each league with its rounded mean goals value, placed just to the
# right of the league's median.
medians = data.matches_df.groupby("league_name")["total_goals"].median()
means = data.matches_df.groupby("league_name")["total_goals"].mean()
for i, league in enumerate(league_order):
    mean_value = round(means[league], 2)
    ax1.text(
        medians[league] + 0.2,
        i,
        f"{mean_value}",
        horizontalalignment="center",
        verticalalignment="center",
        size="small",
        color="black",
        weight="semibold",
        rotation=270,
    )

league_matches_by_year.T.plot(
    kind="bar",
    stacked=True,
    ax=ax2,
    legend=False,
    color=[league_color_mapping[league] for league in league_matches_by_year.index],
)

ax2.set_title("Number of Matches per Season for Each League")
ax2.set_ylabel("Number of Matches (mult. by player count)")
ax2.set_ylim([0, 5000])

sns.lineplot(
    ax=ax3,
    data=median_goals_per_league,
    x="season_start_year",
    y="total_goals",
    hue="league_name",
    palette=league_color_mapping,
    legend=False,
    alpha=0.5,
    linewidth=1,
)
sns.lineplot(
    ax=ax3,
    data=median_goals_all_leagues,
    x="season_start_year",
    y="total_goals",
    color="black",
    linewidth=2,
)
ax1.set_ylabel("League")
ax1.set_xlabel("Goals (mean)")

ax2.set_xlabel("Season")

# The plotted series are computed with .mean(), so the labels say "Mean".
ax3.set_ylabel("Mean Number of Goals")
ax3.set_xlabel("Season")
ax3.set_title("Mean Number of Goals per League and Overall Mean")
ax3.set_ylim([2.01, 3.99])

plt.tight_layout()
plt.show()

/tmp/ipykernel_17118/4035172131.py:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.boxenplot(
/tmp/ipykernel_17118/4035172131.py:24: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45)

# Cross-check the parsed goal events against the goals recorded in the match
# results: `found_goals` counts goal events per match, `total_goals` is
# home + away goals from the result.
importlib.reload(data_process)
goals_info_df = data_process.process_goal_info(data, full_df)

matches_df_short = data.matches_df_short.copy()
matches_df_short["total_goals"] = (
    matches_df_short["home_team_goal"] + matches_df_short["away_team_goal"]
)

# One row per match that has at least one goal event.
goals_found_summary = (
    goals_info_df.groupby("match_api_id").size().reset_index(name="found_goals")
)

goals_verify = matches_df_short[
    ["match_api_id", "league_name", "home_team_goal", "away_team_goal", "total_goals"]
].merge(goals_found_summary, how="left", on="match_api_id")
# Matches without any goal event have no row in the summary.
goals_verify["found_goals"] = goals_verify["found_goals"].fillna(0)

if VERBOSE:
    display(
        Markdown(
            """Total Number of goal events ('found_goals') and goals based on match results"""
        )
    )
    # display() is required: a bare expression inside `if` is not rendered.
    display(
        goals_verify.groupby(["league_name"])
        .sum()
        .sort_values(by="found_goals", ascending=False)
    )

# Legend text for the goal-type codes found in goals_info_df["goal_type"].
goal_type_legend_labels = dict(n="Normal", p="Penalty", o="Own")

# Share of each goal type within its league.
_goal_type_counts = goals_info_df.groupby(["league_name", "goal_type"]).size()
_league_totals = _goal_type_counts.groupby(level=0).transform("sum")
goal_types = (_goal_type_counts / _league_totals).reset_index(name="proportion")

# Share of goals per league that have an assisting player recorded.
goals_info_df["assist_present"] = goals_info_df["assist_player_id"].notna()
assist_proportion = (
    goals_info_df.groupby("league_name")["assist_present"]
    .mean()
    .reset_index(name="assist_proportion")
)

# One row per (league, goal type) carrying the league's assist share as well.
combined_df = goal_types.merge(assist_proportion, on="league_name")

# Wide layout for the stacked bars: leagues as rows, goal types as columns,
# each row rescaled to sum to 1.
pivot_df = combined_df.pivot(
    index="league_name", columns="goal_type", values="proportion"
)
_row_totals = pivot_df.sum(axis=1)
pivot_df = pivot_df.div(_row_totals, axis=0)

importlib.reload(graph)
graph.render_goal_type_plot(
    pivot_df=pivot_df,
    combined_df=combined_df,
    goal_type_legend_labels=goal_type_legend_labels,
)

# Build one row per (player, league) with matches played, goals scored and
# the player's name.
#
# 'all_players_in_game' is exploded to get player-match combinations because
# the goals data only includes players who have scored goals.
player_league = data.matches_df.explode("all_players_in_game")[
    ["all_players_in_game", "league_name"]
].rename(columns={"all_players_in_game": "player_id"})

player_league_grouped = (
    player_league.groupby(["player_id", "league_name"])
    .size()
    .reset_index(name="matches_played")
)

player_goals = (
    goals_info_df.groupby(["scoring_player_id", "league_name"])
    .size()
    .reset_index(name="goals")
)

player_league_goals = pd.merge(
    player_league_grouped,
    player_goals,
    left_on=["player_id", "league_name"],
    right_on=["scoring_player_id", "league_name"],
    how="left",
).drop(columns="scoring_player_id")

# Players without a goal event have no match on the right side of the merge.
# Fill only the goals column instead of blanket-filling the whole frame.
player_league_goals["goals"] = player_league_goals["goals"].fillna(0)

player_league_goals = pd.merge(
    player_league_goals,
    data.player_df[["player_api_id", "player_name"]],
    left_on="player_id",
    right_on="player_api_id",
    how="left",
).drop(columns="player_api_id")

if VERBOSE:
    # display() is required: a bare expression inside `if` is not rendered.
    display(player_league_goals)

# Player-rank tiers as (key, display label, cumulative fraction of players).
# NOTE(review): the key "top_50" is paired with the label "Top 5-10%" and the
# fraction 0.1 -- the key name looks stale; confirm before relying on it.
thresholds_config = [
    ("top_1", "Top 1%", 0.01),
    ("top_5", "Top 1-5%", 0.05),
    ("top_50", "Top 5-10%", 0.1),
    ("remaining", "Remaining %", 1),
]

# Total goals per (league, player).
league_player_goals = (
    player_league_goals.groupby(["league_name", "player_id"])
    .agg({"goals": "sum"})
    .reset_index()
)

# Best scorers first within each league; the cumulative sum and the rank
# below both depend on this ordering.
league_player_goals.sort_values(
    by=["league_name", "goals"], ascending=[True, False], inplace=True
)

# Running goal total down the ranking of each league.
league_player_goals["cumulative_goals"] = league_player_goals.groupby("league_name")[
    "goals"
].cumsum()
# 1-based position of the player in the league's ranking.
league_player_goals["player_rank"] = (
        league_player_goals.groupby("league_name").cumcount() + 1
)

total_goals_by_league = league_player_goals.groupby("league_name")["goals"].sum()
total_players_by_league = league_player_goals.groupby("league_name")[
    "player_id"
].nunique()

# Per league: the rank cut-off of every tier, i.e. fraction * number of
# players truncated to an integer (so a small league can get a cut-off of 0).
thresholds = total_players_by_league.apply(
    lambda x: pd.Series({key: int(val * x) for key, name, val in thresholds_config})
).astype(int)


def calculate_percentages(league):
    """Return the share of a league's goals scored by each player-rank tier.

    Reads the module-level ``league_player_goals`` (sorted by goals within
    each league, with ``player_rank`` and ``cumulative_goals`` columns),
    ``total_goals_by_league``, ``thresholds`` and ``thresholds_config``.

    Args:
        league: League name, a key of ``total_goals_by_league`` and a row
            label of ``thresholds``.

    Returns:
        pandas.Series indexed by the tier labels of ``thresholds_config``.
        Every value is NaN when the league has no recorded goals (the
        original produced the same NaNs through a 0/0 division that emitted
        a RuntimeWarning).
    """
    league_data = league_player_goals[league_player_goals["league_name"] == league]
    total_goals = total_goals_by_league[league]

    # Covers both a zero and a NaN total without dividing by it.
    if not total_goals > 0:
        return pd.Series(
            {label: float("nan") for _, label, _ in thresholds_config}, dtype=float
        )

    percentages = {}
    goals_counted = 0
    for key, label, _ in thresholds_config:
        if key == "remaining":
            # Whatever the tiers listed before this entry did not cover.
            percentages[label] = 1 - sum(percentages.values())
            continue

        cutoff_rank = thresholds.loc[league, key]
        within_cutoff = league_data.loc[
            league_data["player_rank"] <= cutoff_rank, "cumulative_goals"
        ]
        # A cut-off of 0 (small league) selects nobody; the tier then adds
        # no goals instead of raising IndexError on .iloc[-1].
        goals_at_cutoff = (
            within_cutoff.iloc[-1] if len(within_cutoff) else goals_counted
        )
        percentages[label] = (goals_at_cutoff - goals_counted) / total_goals
        goals_counted = goals_at_cutoff

    return pd.Series(percentages)


# One column per league, one row per tier label.
proportions = pd.DataFrame(
    {league: calculate_percentages(league) for league in total_goals_by_league.index}
)

# Drop the leagues whose name mentions one of these countries.
_excluded_countries = ("Scotland", "Belgium", "Poland", "Portugal")
_kept_leagues = [
    column
    for column in proportions.columns
    if not any(country in column for country in _excluded_countries)
]
proportions = proportions[_kept_leagues]

/tmp/ipykernel_17118/1727508686.py:47: RuntimeWarning: invalid value encountered in scalar divide
  percentages[label] = (goals_at_threshold - last_threshold) / total_goals
/tmp/ipykernel_17118/1727508686.py:47: RuntimeWarning: invalid value encountered in scalar divide
  percentages[label] = (goals_at_threshold - last_threshold) / total_goals

importlib.reload(graph)
graph.plot_player_goal_inequality(proportions, total_players_by_league)

# Players with more than 4 matches.  The explicit .copy() makes this an
# independent frame, so the column assignments below cannot trigger
# SettingWithCopyWarning or write into player_league_goals.
player_league_goals_s = player_league_goals[
    player_league_goals["matches_played"] > 4
].copy()

player_league_goals_s["mean_goals"] = (
    player_league_goals_s["goals"] / player_league_goals_s["matches_played"]
)

# Cap extreme values so a handful of outliers do not stretch the axes.
player_league_goals_s["goals_norm"] = player_league_goals_s["goals"].clip(upper=150)
player_league_goals_s["matches_played_norm"] = player_league_goals_s[
    "matches_played"
].clip(upper=100)

# Goal buckets: (-1, 0], (0, 5], (5, 10], (10, inf).
bins = [-1, 0, 5, 10, float("inf")]
labels = ["0", "1-5", "5-10", "10+"]

player_league_goals_s["goal_bins"] = pd.cut(
    player_league_goals_s["goals"], bins=bins, labels=labels
)

g = sns.JointGrid(
    data=player_league_goals_s,
    x="matches_played",
    y="goals_norm",
    height=10,
    ratio=4,
)

# Individual players as faint crosses ...
g.ax_joint.scatter(
    player_league_goals_s["matches_played"],
    player_league_goals_s["goals_norm"],
    color="teal",
    marker="+",
    alpha=0.2,
    s=100,
    linewidth=0.5,  # thickness of the lines in the marker
)

# ... with hexagonal density bins on top; cells with fewer than 5 players
# are left empty so the crosses stay visible there.
g.ax_joint.hexbin(
    player_league_goals_s["matches_played"],
    player_league_goals_s["goals_norm"],
    gridsize=15,
    mincnt=5,
    cmap="Blues",
    alpha=1,
)

g.plot_marginals(sns.histplot, bins=20, fill=False)

custom_x_ticks = [5, 50, 100, 150, 200, 250, 300]
custom_x_labels = [5, 50, 100, 150, 200, 250, 300]

custom_y_ticks = [0, 20, 40, 60, 80, 100, 125]
custom_y_labels = [0, 20, 40, 60, 80, 100, ">125"]

g.ax_joint.set_xticks(custom_x_ticks)
g.ax_joint.set_xticklabels(custom_x_labels)

g.ax_joint.set_yticks(custom_y_ticks)
g.ax_joint.set_yticklabels(custom_y_labels)

g.fig.suptitle("Matches Played by Players vs Goals Scored", fontsize=18)

# Leave room above the axes for the suptitle.
g.fig.subplots_adjust(top=0.9)
g.set_axis_labels("Total Matches Played", "Total Goals Scored")

<seaborn.axisgrid.JointGrid at 0x7fb5e3993df0>

# Distribution of matches played per player, one boxen row per league,
# leagues ordered by their median (highest first).
plt.figure(figsize=(10, 10))

# NOTE(review): this is an alias of the whole frame, not a list of names, and
# nothing in this cell reads it; kept in case a later cell does.
league_names = player_league_goals_s

# Median matches played per league, computed once and reused for both the
# row order and the annotations (the original re-filtered the frame per
# league inside the loop to get the same numbers).
league_medians = (
    player_league_goals_s.groupby("league_name")["matches_played"]
    .median()
    .sort_values(ascending=False)
)
league_names_sorted = league_medians.index.tolist()

ax = sns.boxenplot(
    k_depth="tukey",
    fill=False,
    linewidth=2.1,
    data=player_league_goals_s,
    x="matches_played",
    y="league_name",
    orient="y",
    line_kws=dict(linewidth=1.5, color="black"),
    order=league_names_sorted,
    width_method="linear",
)

for i, (league, median) in enumerate(league_medians.items()):
    if i == 0:
        # Caption above the first row explaining what the numbers are.
        ax.text(
            median,
            i - 0.65,
            "Median",
            ha="center",
            va="center",
            fontsize=12,
        )

    # Median value, written vertically just right of the median line.
    ax.text(
        median + 5,
        i,
        f"{median}",
        ha="center",
        va="center",
        fontsize=13,
        rotation=270,
    )

plt.xlabel("Total Matches Played (2010-2016)")
plt.ylabel("\n")  # blank y-axis label; the league names are the tick labels
plt.title("Matches Count by Player ")

Text(0.5, 1.0, 'Matches Count by Player ')

# Share of players in each goal bucket as a function of matches played
# (filled KDE on a base-5 logarithmic x axis).
g2 = sns.displot(
    data=player_league_goals_s,
    x="matches_played_norm",
    hue="goal_bins",
    kind="kde",
    height=7,
    log_scale=[5, None],
    multiple="fill",
    clip=(0, None),
    palette="ch:rot=-.25,hue=1,light=.75",
)

custom_x_ticks_dist = [3, 5, 10, 15, 20, 30, 50, 100]
custom_x_labels_dist = [3, 5, 10, 15, 20, 30, 50, ">100"]

for ax in g2.axes.ravel():
    ax.set_xticks(custom_x_ticks_dist)
    ax.set_xticklabels(custom_x_labels_dist)
    # Only the upper bound is set: a lower bound of 0 is invalid on a log
    # axis and was ignored with a UserWarning.
    ax.set_xlim(right=100)

g2.fig.suptitle("Player Total Goals by Number of Matches", fontsize=16)
g2.fig.subplots_adjust(top=0.9)  # leave room for the suptitle
plt.xlabel("Total Matches Played (2010-2016)")
plt.ylabel("% of Players")
g2.legend.set_title("Goals")

plt.show()

/tmp/ipykernel_17118/3368172736.py:23: UserWarning: Attempt to set non-positive xlim on a log-scaled axis will be ignored.
  ax.set_xlim([0, 100])

# Final standings: for every team and season take the row with the highest
# `stage`, then flag season winners and top-4 finishers.
temp = full_df.sort_values(by=["team_id", "season_start_year", "stage"])
last_games = temp.loc[
    full_df.groupby(["team_id", "season_start_year"])["stage"].idxmax()
]
if VERBOSE:
    # display() is required: a bare expression inside `if` is not rendered.
    display(last_games)
    display(last_games["league_name"].value_counts())

pl_2015 = last_games[
    (last_games["league_name"] == "England Premier League")
    & (last_games["season_start_year"] == 2015)
]

# .copy() gives an independent frame; assigning new columns to a plain
# column selection raised SettingWithCopyWarning.
league_last_games = last_games[
    [
        "team_id",
        "stage",
        "team_team_long_name",
        "cumulative_points",
        "league_name",
        "season_start_year",
        "team_rating",
    ]
].copy()

# The original also called league_last_games.sort_values(...) here and threw
# the result away (no assignment, no inplace); that no-op was dropped.

# Points as a fraction of the maximum achievable (3 per stage played).
league_last_games["normalized_points"] = league_last_games["cumulative_points"] / (
    league_last_games["stage"] * 3
)

winners = league_last_games.groupby(["league_name", "season_start_year"])[
    "cumulative_points"
].idxmax()
# Row labels of the four highest point totals per league and season.
top_4_indices = (
    league_last_games.groupby(["league_name", "season_start_year"])["cumulative_points"]
    .nlargest(4)
    .reset_index(level=[0, 1], drop=True)
    .index
)

league_last_games["won_season"] = False
league_last_games.loc[winners, "won_season"] = True

league_last_games["top_4"] = False
league_last_games.loc[top_4_indices, "top_4"] = True

# team_rating is only for a specific game, we need to get the season average
avg_season_team_rating = (
    full_df.groupby(by=["team_id", "season_start_year"])["team_rating"]
    .mean()
    .reset_index()
    .rename(columns={"team_rating": "avg_season_team_rating"})
)

league_last_games = league_last_games.merge(
    avg_season_team_rating, on=["team_id", "season_start_year"]
)

/tmp/ipykernel_17118/1595024916.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  league_last_games["normalized_points"] = league_last_games["cumulative_points"] / (
/tmp/ipykernel_17118/1595024916.py:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  league_last_games["won_season"] = False
/tmp/ipykernel_17118/1595024916.py:38: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  league_last_games["top_4"] = False

def gini_coefficient(points):
    """Return the Gini coefficient of an iterable of non-negative values.

    0 means every value is equal; the value approaches 1 as the total is
    concentrated in fewer entries.

    Args:
        points: Iterable of numbers (e.g. the season point totals of the
            teams in one league).

    Returns:
        The coefficient as a float.  NaN for empty input, and 0.0 when all
        values are zero (nothing to distribute, hence no inequality); both
        cases previously ended in a division by zero.
    """
    values = sorted(points)
    n = len(values)
    if n == 0:
        return float("nan")

    total = sum(values)
    if total == 0:
        return 0.0

    # The smallest value gets weight n, the largest weight 1.
    weighted = sum((n - rank) * value for rank, value in enumerate(values))
    return (n + 1 - 2 * weighted / total) / n


# Gini coefficient of the final point totals for every league and season.
# NOTE(review): league_year_gini is assigned again further down in this file,
# so this version is only visible until then.
league_year_gini = (
    league_last_games.groupby(["league_name", "season_start_year"])["cumulative_points"]
    .agg(gini_coefficient)
    .reset_index()
)
league_year_gini.rename(columns={"cumulative_points": "gini_coefficient"}, inplace=True)

# Same measure with all seasons of a league pooled into one list of points.
league_aggregated = (
    league_last_games.groupby("league_name")["cumulative_points"]
    .agg(list)
    .reset_index()
)

league_aggregated["gini_coefficient"] = league_aggregated["cumulative_points"].apply(
    gini_coefficient
)

# The raw point lists were only needed to compute the coefficient.
league_aggregated.drop(columns=["cumulative_points"], inplace=True)
# NOTE(review): a bare expression inside `if` is not rendered by IPython;
# this probably wants display(league_year_gini).
if VERBOSE:
    league_year_gini

# Reused below for the per-season competitiveness metrics.
grouped = league_last_games.groupby(["league_name", "season_start_year"])


def league_metrics(group):
    """Summarise how competitive one league season was.

    Args:
        group: DataFrame with one row per team, providing the columns
            ``team_team_long_name`` and ``cumulative_points``.

    Returns:
        pandas.Series with:
            total_teams: number of distinct team names.
            bottom_5_to_top_5_proportion: points of the 5 lowest teams
                divided by points of the 5 highest (the two sets overlap
                when there are fewer than 10 teams); NaN if the top-5 total
                is 0.
            gap_top1_top2: point gap between first and second place; NaN
                when there are fewer than two teams (previously an
                IndexError).
            closeness_top_5: standard deviation of the top-5 points.
            std_dev_points: standard deviation of all teams' points.
    """
    nan = float("nan")
    ranked_points = group["cumulative_points"].sort_values(ascending=False)
    top_5 = ranked_points.head(5)
    bottom_5 = ranked_points.tail(5)
    top_5_total = top_5.sum()

    if len(ranked_points) > 1:
        leader_gap = ranked_points.iloc[0] - ranked_points.iloc[1]
    else:
        leader_gap = nan

    return pd.Series(
        {
            "total_teams": group["team_team_long_name"].nunique(),
            "bottom_5_to_top_5_proportion": (
                bottom_5.sum() / top_5_total if top_5_total else nan
            ),
            "gap_top1_top2": leader_gap,
            "closeness_top_5": top_5.std(),
            "std_dev_points": group["cumulative_points"].std(),
        }
    )


# Per league and season: competitiveness metrics plus the Gini coefficient of
# the final point totals.
league_year_metrics = grouped.apply(league_metrics)

# gini_coefficient is defined earlier in this file.  A second, byte-identical
# definition used to be repeated at this point and has been removed.
league_year_gini = league_year_metrics.assign(
    gini_coefficient=grouped["cumulative_points"].apply(gini_coefficient)
)

if VERBOSE:
    # display() is required: a bare expression inside `if` is not rendered.
    display(league_year_gini)

if VERBOSE:
    importlib.reload(graph)
    graph.render_dist_swarm_plot(league_last_games)

importlib.reload(graph)
graph.render_dist_kurtosis_swarm_plot(league_last_games)

/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/seaborn/categorical.py:3370: UserWarning: 15.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/seaborn/categorical.py:3370: UserWarning: 14.5% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/seaborn/categorical.py:3370: UserWarning: 7.2% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/seaborn/categorical.py:3370: UserWarning: 13.8% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/seaborn/categorical.py:3370: UserWarning: 15.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/seaborn/categorical.py:3370: UserWarning: 5.5% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/seaborn/categorical.py:3370: UserWarning: 14.5% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/seaborn/categorical.py:3370: UserWarning: 7.2% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/seaborn/categorical.py:3370: UserWarning: 13.8% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)

# Ad-hoc inspection tables, shown only when VERBOSE is set.  Every table goes
# through display() because a bare expression inside `if` is not rendered.
if VERBOSE:
    # Which team in the EPL received almost no points?
    display(
        league_last_games[
            league_last_games["league_name"] == "England Premier League"
        ].sort_values(by=["normalized_points"])
    )

if VERBOSE:
    display(
        league_last_games[
            league_last_games["league_name"] == "Portugal Liga ZON Sagres"
        ].sort_values(by=["normalized_points"], ascending=False)
    )
    display(
        league_last_games[league_last_games["league_name"] == "England Premier League"][
            "cumulative_points"
        ].tolist()
    )

if VERBOSE:
    # Goal events of one specific match, first raw and then with the
    # scorer's name attached.
    leicsert_Game = data.goals_df[data.goals_df["match_api_id"] == 1987598]
    display(leicsert_Game)
    leicsert_Game = leicsert_Game.merge(
        data.player_df[["player_api_id", "player_name"]],
        left_on="scoring_player_id",
        right_on="player_api_id",
        how="left",
    ).drop("player_api_id", axis=1)
    display(leicsert_Game)

if VERBOSE:
    # Same goal events with league and team names from full_df.
    display(
        leicsert_Game.merge(
            full_df[["match_api_id", "league_name", "team_id", "team_team_long_name"]],
            how="left",
            on=["team_id", "match_api_id"],
        )
    )

# Second pass of the goal-event cross-check, followed by drill-down tables
# for matches where the event count disagrees with the result.
importlib.reload(data_process)
goals_info_df = data_process.process_goal_info(data, full_df)

# Built unconditionally: goals_verify below needs it, and the original only
# created it under VERBOSE (relying on a value left over from an earlier
# cell).
matches_df_short = data.matches_df_short.copy()
matches_df_short["total_goals"] = (
    matches_df_short["home_team_goal"] + matches_df_short["away_team_goal"]
)
if VERBOSE:
    # display() is required: a bare expression inside `if` is not rendered.
    display(matches_df_short["total_goals"].sum())

goals_found_summary = (
    goals_info_df.groupby(["match_api_id"])["match_api_id"]
    .count()
    .to_frame()
    .rename(columns={"match_api_id": "found_goals"})
    .reset_index()
)

goals_verify = matches_df_short[
    ["match_api_id", "league_name", "home_team_goal", "away_team_goal", "total_goals"]
].merge(goals_found_summary, how="left", on="match_api_id")
# Matches without any goal event have no row in the summary.
goals_verify["found_goals"] = goals_verify["found_goals"].fillna(0)

if VERBOSE:
    display(goals_verify.groupby(["league_name"]).sum())

if VERBOSE:
    # EPL matches whose goal-event count differs from the result.
    display(
        goals_verify[
            (goals_verify["total_goals"] != goals_verify["found_goals"])
            & (goals_verify["league_name"] == "England Premier League")
        ]
    )

if VERBOSE:
    # Raw records of one mismatching match.
    display(data.goals_df[data.goals_df["match_api_id"] == 489229])
    display(data.matches_df[data.matches_df["match_api_id"] == 489229]["goal"].iloc[0])
    display(data.matches_df[data.matches_df["match_api_id"] == 489229])

# Ratings for Polish Teams are mostly missing
league_last_games_for_ratings = league_last_games[
    league_last_games["league_name"] != "Poland Ekstraklasa"
]

importlib.reload(graph)
graph.render_dist_plot(league_last_games_for_ratings)

import numpy as np
import matplotlib.colors as mcolors

# "crest" colormap with its lowest entry made fully transparent, so the
# emptiest hexagons do not paint over the background.
base_cmap = sns.color_palette("crest", as_cmap=True)

colors = base_cmap(np.arange(base_cmap.N))
colors[0, :] = [0, 0, 0, 0]
custom_cmap = mcolors.ListedColormap(colors)

p = sns.jointplot(
    data=league_last_games_for_ratings,
    x="avg_season_team_rating",
    y="normalized_points",
    kind="hex",
    height=8,
    cmap=custom_cmap,
)

p.fig.suptitle("Distribution of Team Rating by Points Scored by Team")
p.fig.tight_layout()
p.fig.subplots_adjust(top=0.95)  # Reduce plot to make room

# Standardise every numeric column within its league (z-score).
# numeric_only=True is passed explicitly: relying on the implicit default is
# deprecated, and with the new default the text columns would make
# mean/std raise.
groups = league_last_games_for_ratings.groupby("league_name")
mean = groups.transform("mean", numeric_only=True)
std = groups.transform("std", numeric_only=True)
league_last_games_normalized = (
    league_last_games_for_ratings[mean.columns] - mean
) / std

p = sns.jointplot(
    data=league_last_games_normalized,
    x="avg_season_team_rating",
    y="normalized_points",
    kind="hex",
    height=8,
    cmap=custom_cmap,
)

p.fig.suptitle("Team Rating normalized by League")
p.fig.tight_layout()
p.fig.subplots_adjust(top=0.95)  # Reduce plot to make room

# The league label is not numeric and was lost in the standardisation; put
# it back for the per-league analysis that follows.
league_last_games_normalized["league_name"] = league_last_games_for_ratings[
    "league_name"
]

/tmp/ipykernel_17118/2216064224.py:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  mean, std = groups.transform("mean"), groups.transform("std")
/tmp/ipykernel_17118/2216064224.py:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.std is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  mean, std = groups.transform("mean"), groups.transform("std")

# Within-league correlation between season rating and normalised points.
# .corr() yields a 2x2 matrix per league; reset_index() exposes the matrix
# row label as the column "level_1".
_pairwise_corr = league_last_games_normalized.groupby("league_name")[
    ["avg_season_team_rating", "normalized_points"]
].corr()
corr_map = _pairwise_corr.reset_index()

# Keep the "normalized_points" row of each matrix: its
# "avg_season_team_rating" entry is the correlation of interest.
_off_diagonal = corr_map["level_1"] != "avg_season_team_rating"
corr_map = corr_map[_off_diagonal].drop(columns=["normalized_points"])
corr_map = corr_map.sort_values(by="avg_season_team_rating", ascending=False)

# Leagues with the strongest and the weakest correlation.
selected_leagues = corr_map.iloc[[0, -1]]["league_name"].tolist()

# Every other league is lumped into one hue level.
league_last_games_normalized["league_hue"] = league_last_games_for_ratings[
    "league_name"
].map(lambda name: name if name in selected_leagues else "Other Leagues")

g = sns.lmplot(
    data=league_last_games_normalized,
    x="avg_season_team_rating",
    y="normalized_points",
    hue="league_hue",
    height=10,
    scatter_kws={"s": 15, "alpha": 0.5},
)

# Both axes are in standard-deviation units; clip the view to +/- 2.5.
g.ax.set_ylim([-2.5, 2.5])
g.ax.set_xlim([-2.5, 2.5])
g.fig.suptitle("Relationship Between Team Player Rating and Points in Season")

g.set_axis_labels("Team Rating (standardized)", "Normalized Points (standardized)")

<seaborn.axisgrid.FacetGrid at 0x7fb5e906e530>

# Rating-related columns of every team-match row, ordered by stage.
# (This assignment was written twice in a row; once is enough.)
temp = full_df[
    ["league_name", "season_start_year", "stage", "cumulative_points", "team_rating"]
].sort_values(by=["stage"])


def top_bottom_mean_normalized(group):
    """Summarise the min-max normalised team ratings of one group.

    Ratings are rescaled to the range 0-1000 within the group, then the
    overall mean and the means of the highest and lowest 20% are returned.

    Args:
        group: DataFrame with a ``team_rating`` column.

    Returns:
        pandas.Series with ``overall_mean_normalized``,
        ``top_20%_mean_normalized`` and ``bottom_20%_mean_normalized``.
        All three are NaN when the group has no usable rating or all
        ratings are equal (the range is then zero).

    Notes:
        Missing ratings are dropped first.  Previously they were sorted to
        the end and therefore fell into the "top 20%" slice.
        At least one value is used for each tail.  Previously a group of
        fewer than 5 rows gave a slice size of 0, for which ``[-0:]``
        selects the *whole* series and ``[:0]`` nothing.
    """
    ratings = group["team_rating"].dropna()
    lowest = ratings.min()
    highest = ratings.max()

    normalized_rating = (ratings - lowest) / (highest - lowest)
    normalized_rating = normalized_rating * 1000

    ordered = normalized_rating.sort_values()
    tail_size = max(1, int(0.20 * len(ordered)))

    return pd.Series(
        {
            "overall_mean_normalized": normalized_rating.mean(),
            "top_20%_mean_normalized": ordered.iloc[-tail_size:].mean(),
            "bottom_20%_mean_normalized": ordered.iloc[:tail_size].mean(),
        }
    )


# Normalised rating summary for every (league, stage) pair; rows from all
# seasons are pooled because the season is not part of the grouping key.
normalized_results = temp.groupby(["league_name", "stage"]).apply(
    top_bottom_mean_normalized
)

# Turn the (league_name, stage) index into ordinary columns.
normalized_results = normalized_results.reset_index()
# NOTE(review): the second reset_index() only adds an "index" column, which
# the column selection drops again; it looks unnecessary.
reshaped_data = normalized_results.reset_index()[
    ["league_name", "stage", "overall_mean_normalized"]
]

if VERBOSE:

    # One line per league: mean normalised rating by stage.
    plt.figure(figsize=(10, 6))
    sns.lineplot(
        data=normalized_results, x="stage", y="overall_mean_normalized", hue="league_name"
    )

    # Shade the range between the top-20% and bottom-20% means per league.
    # NOTE(review): the title below calls this band a confidence interval,
    # but it is not computed as one.
    for league in normalized_results["league_name"].unique():
        league_data = normalized_results[normalized_results["league_name"] == league]
        plt.fill_between(
            league_data["stage"],
            league_data["top_20%_mean_normalized"],
            league_data["bottom_20%_mean_normalized"],
            alpha=0.2,
        )

    plt.ylim([0, 1000])
    plt.title("Normalized Team Rating Mean with Confidence Intervals by League and Stage")
    plt.xlabel("Stage")
    plt.ylabel("Normalized Rating")

    # Legend outside the axes, anchored at the upper-right corner.
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)
    plt.show()

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Recompute the per-(league, stage) rating summary as a flat table.
_by_league_stage = temp.groupby(["league_name", "stage"])
normalized_results = _by_league_stage.apply(top_bottom_mean_normalized).reset_index()

# Mean over leagues of the normalised rating at each stage.
_stage_groups = normalized_results.groupby("stage")
average_trend = _stage_groups["overall_mean_normalized"].mean()


def is_significantly_different(league_data, average_trend, threshold=0.1, alpha=0.05):
    """Tell whether a league's rating trend departs from the average trend.

    A straight line is fitted to the league's ``overall_mean_normalized``
    against ``stage`` and another to ``average_trend``.  The league is
    flagged when its slope differs from the average slope by more than
    ``threshold`` and the league's own fit has a p-value below ``alpha``.

    Args:
        league_data: DataFrame with ``stage`` and
            ``overall_mean_normalized`` columns for a single league.
        average_trend: Series of average values indexed by stage.
        threshold: Minimum absolute slope difference.  Used to be read from
            a module-level ``threshold`` variable; the default equals the
            value that variable is given in this file.
        alpha: Significance level for the league's fit.  Used to be read
            from a module-level ``alpha`` variable; the default equals the
            value that variable is given in this file.

    Returns:
        bool: True when both conditions hold.

    Note:
        The p-value tests the league's slope against zero, not against the
        average slope, so "significant" here means "has a real trend that
        also differs in size from the average one".
    """
    league_fit = stats.linregress(
        league_data["stage"], league_data["overall_mean_normalized"]
    )
    average_fit = stats.linregress(average_trend.index, average_trend.values)
    slope_gap = np.abs(league_fit[0] - average_fit[0])
    return bool(slope_gap > threshold and league_fit[3] < alpha)


# Minimum slope difference and significance level used to decide which
# leagues are drawn.
threshold = 0.1
alpha = 0.05


def _rows_for(league_name):
    """Rows of normalized_results that belong to one league."""
    return normalized_results[normalized_results["league_name"] == league_name]


# Leagues whose rating trend departs from the all-league average.
significant_leagues = [
    league_name
    for league_name in normalized_results["league_name"].unique()
    if is_significantly_different(_rows_for(league_name), average_trend)
]

plt.figure(figsize=(10, 6))

# Flagged leagues as faint lines ...
for league_name in significant_leagues:
    sns.lineplot(
        data=_rows_for(league_name),
        x="stage",
        y="overall_mean_normalized",
        label=league_name,
        alpha=0.25,
    )

# ... and the average across leagues on top.
sns.lineplot(data=average_trend, label="Average Trend")

plt.ylim([0, 1000])
plt.title("Change in Team Rating During Season")
plt.xlabel("Stage")
plt.ylabel("Normalized Rating")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)
plt.show()

# Per-(league, stage) rating summary, flattened to columns.
normalized_results = (
    temp.groupby(["league_name", "stage"])
    .apply(top_bottom_mean_normalized)
    .reset_index()
)

# Average over leagues, per stage, of the overall mean and of both tails.
_stage_aggregations = {
    "overall_mean": ("overall_mean_normalized", "mean"),
    "top_20_mean": ("top_20%_mean_normalized", "mean"),
    "bottom_20_mean": ("bottom_20%_mean_normalized", "mean"),
}
average_trend = normalized_results.groupby("stage").agg(**_stage_aggregations)


def calc_r2_and_pval(x, y):
    """Return ``(R², p-value)`` of the least-squares line fitted to *y* over *x*.

    Both values are taken from ``scipy.stats.linregress(x, y)``; R² is the
    square of the reported correlation coefficient.
    """
    fit = stats.linregress(x, y)
    return fit.rvalue ** 2, fit.pvalue


# Keep every league whose stage -> rating regression is significant, together
# with its R² and p-value so both can be shown in the legend.
significant_leagues = []
alpha = 0.05  # significance level
for league in normalized_results["league_name"].unique():
    league_data = normalized_results[normalized_results["league_name"] == league]
    r2, pval = calc_r2_and_pval(
        league_data["stage"], league_data["overall_mean_normalized"]
    )
    if pval < alpha:
        significant_leagues.append((league, r2, pval))

plt.figure(figsize=(10, 6))

# The shaded band spans the stage-wise top-20% and bottom-20% means. It is a
# range between two group means, not a confidence interval, so it is labelled
# as such.
plt.fill_between(
    average_trend.index,
    average_trend["top_20_mean"],
    average_trend["bottom_20_mean"],
    alpha=0.2,
    label="Top 20% to Bottom 20% Mean Range",
)

# Average trend, annotated in the same "(R²: ..., p: ...)" format as the
# per-league lines below.
r2_avg, p_avg = calc_r2_and_pval(average_trend.index, average_trend["overall_mean"])
sns.lineplot(
    data=average_trend,
    x=average_trend.index,
    y="overall_mean",
    label=f"Average Trend\n(R²: {r2_avg:.2f}, p: {p_avg:.2f})",
    color="black",
)

for league, r2, pval in significant_leagues:
    league_data = normalized_results[normalized_results["league_name"] == league]
    sns.lineplot(
        data=league_data,
        x="stage",
        y="overall_mean_normalized",
        label=f"{league}\n(R²: {r2:.2f}, p: {pval:.2f})",
    )

plt.ylim([0, 1000])
plt.title("Change in Team Ratings Over Season")
plt.xlabel("Stage")
plt.ylabel("Normalized Rating")
# Legend sits outside the axes on the right so it does not cover the lines.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)
plt.show()

# Model the match result from a single predictor: the ratio between a team's
# rating and its opponent's rating.
team_ratio_ratio_df = full_df[
    ["team_rating", "opponent_team_rating", "league_name", "result"]
]

# Drop rows missing either rating: a NaN ratio would reach the model and make
# LogisticRegression.fit raise. The explicit copy keeps the column assignments
# below from writing into a slice of `full_df`.
team_ratio_ratio_df = team_ratio_ratio_df.dropna(
    subset=["team_rating", "opponent_team_rating"]
).copy()
team_ratio_ratio_df["ratio"] = (
    team_ratio_ratio_df["team_rating"] / team_ratio_ratio_df["opponent_team_rating"]
).round(3)

# Clip the ratio to two standard deviations around 1 to limit the influence
# of extreme values.
std_dev = team_ratio_ratio_df["ratio"].std()
team_ratio_ratio_df["capped_ratio"] = team_ratio_ratio_df["ratio"].clip(
    1 - std_dev * 2, 1 + std_dev * 2
)

if VERBOSE:
    display(team_ratio_ratio_df["ratio"].describe())
    display(team_ratio_ratio_df["capped_ratio"].describe())
    display(team_ratio_ratio_df["ratio"].std())

X = team_ratio_ratio_df[["capped_ratio"]].values  # predictor, shape (n, 1)
y = team_ratio_ratio_df["result"].values  # target: match result

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.01, random_state=42
)

model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the fitted curve on an evenly spaced grid over the observed range.
x_values = np.linspace(X.min(), X.max(), 300).reshape(-1, 1)
# NOTE(review): column 2 is assumed to be the "win" class — verify against
# model.classes_.
y_values = model.predict_proba(x_values)[:, 2]  # Probability of winning

fig, ax1 = plt.subplots(figsize=(10, 6))

sns.lineplot(x=x_values.flatten(), y=y_values, ax=ax1, color="blue")

ax1.set_xlabel("Team Rating Ratio")
ax1.set_ylabel("Probability of Winning", color="blue")
ax1.axhline(y=1 / 3, color="red", linestyle="--", label="Baseline win Rate (=.33)")
ax1.legend(loc="lower center", ncol=3, bbox_to_anchor=(0.5, -0.25))

ax1.set_ylim([0, 1])

# Secondary axis: distribution of the capped ratio, to show where the data
# behind the curve actually lies.
ax2 = ax1.twinx()
sns.histplot(
    team_ratio_ratio_df["capped_ratio"], ax=ax2, color="gray", bins=30, kde=False
)
ax2.set_ylabel("Number of Samples", color="gray")
ax2.grid(False)

ax2.set_ylim([0, 100000])

plt.title("Relationship between Team Winning and Rating Ratio")
plt.show()

# Feature groups fed into the PCA: base features plus team season and team
# rating statistics.
features = (
    feature_select.FeatureSet.Base
    | feature_select.FeatureSet.TeamSeasonStats
    | feature_select.FeatureSet.TeamRatingStats
)
feature_names = feature_select.get_feature_sets(features)

# Restrict the full frame to those columns and drop incomplete rows.
selected_df = full_df[feature_names]
features_no_nan = selected_df.dropna()

if VERBOSE:
    print("Features used in PCA")
    print(feature_names)

    print("\n--\n")
    print(f"Drop NaN {len(selected_df)} -> {len(features_no_nan)} samples")

13

from sklearn.preprocessing import StandardScaler

# Standardise the NaN-free model features before PCA. The scaled array is
# wrapped back into a frame with the original column names (the row index is
# not carried over).
scaler = StandardScaler()
features_no_nan_std = pd.DataFrame(
    data=scaler.fit_transform(features_no_nan),
    columns=features_no_nan.columns,
)

from workbench.src import shared

# Reload so edits to the helper module are picked up without a kernel restart.
importlib.reload(shared)
# presumably a per-component explained-variance table — see shared.get_pca_explained
_explained_pc = shared.get_pca_explained(features_no_nan_std)
_explained_pc

importlib.reload(graph)
graph.render_pca_component_plot(
    _explained_pc,
    title="PCA (features used in the 'Full' model) Cumulative Var.",
)

data.team_attrs_df

# Team playing-style attributes: numeric scores and their categorical
# "...Class" counterparts.
team_styles = [
    "buildUpPlaySpeed",
    "buildUpPlaySpeedClass",
    "buildUpPlayDribbling",
    "buildUpPlayDribblingClass",
    "buildUpPlayPassing",
    "buildUpPlayPassingClass",
    "buildUpPlayPositioningClass",
    "chanceCreationPassing",
    "chanceCreationPassingClass",
    "chanceCreationCrossing",
    "chanceCreationCrossingClass",
    "chanceCreationShooting",
    "chanceCreationShootingClass",
    "chanceCreationPositioningClass",
    "defencePressure",
    "defencePressureClass",
    "defenceAggression",
    "defenceAggressionClass",
    "defenceTeamWidth",
    "defenceTeamWidthClass",
    "defenceDefenderLineClass",
]

team_attrs_df_full = data.team_attrs_df
team_attrs_df = team_attrs_df_full[team_styles]

if VERBOSE:
    print("Features used in PCA")
    # NOTE(review): prints the FeatureSet.TeamStyle names while the columns
    # actually selected come from `team_styles` — confirm the two lists agree.
    print(feature_select.get_feature_sets(feature_select.FeatureSet.TeamStyle))

    print("\n--\n")

# Only complete rows are kept.
team_attrs_df_no_nan = team_attrs_df.dropna()
if VERBOSE:
    print(f"Drop NaN {len(team_attrs_df)} -> {len(team_attrs_df_no_nan)} samples")

# One-hot encode every column that is neither numeric nor boolean.
non_numeric_bool_columns = list(
    team_attrs_df_no_nan.select_dtypes(exclude=["number", "bool"]).columns
)
team_attrs_df_encoded = pd.get_dummies(
    team_attrs_df_no_nan,
    columns=non_numeric_bool_columns,
)

# Standardise the encoded attributes, then compute the PCA summary on them.
scaler = StandardScaler()
team_attrs_df_no_nan_std = pd.DataFrame(
    data=scaler.fit_transform(team_attrs_df_encoded),
    columns=team_attrs_df_encoded.columns,
)

_explained_pc_team_attrs = shared.get_pca_explained(team_attrs_df_no_nan_std)

importlib.reload(graph)
graph.render_pca_component_plot(_explained_pc_team_attrs)

team_styles

['buildUpPlaySpeed',
 'buildUpPlaySpeedClass',
 'buildUpPlayDribbling',
 'buildUpPlayDribblingClass',
 'buildUpPlayPassing',
 'buildUpPlayPassingClass',
 'buildUpPlayPositioningClass',
 'chanceCreationPassing',
 'chanceCreationPassingClass',
 'chanceCreationCrossing',
 'chanceCreationCrossingClass',
 'chanceCreationShooting',
 'chanceCreationShootingClass',
 'chanceCreationPositioningClass',
 'defencePressure',
 'defencePressureClass',
 'defenceAggression',
 'defenceAggressionClass',
 'defenceTeamWidth',
 'defenceTeamWidthClass',
 'defenceDefenderLineClass']

import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

# Ward-linkage dendrogram of the one-hot encoded team attributes.
# NOTE(review): this uses the unscaled `team_attrs_df_encoded`, not the
# standardised frame built for the PCA — confirm that is intended.
plt.figure(figsize=(32, 16))
ward_linkage_matrix = sch.linkage(team_attrs_df_encoded, method="ward")
dendrogram = sch.dendrogram(ward_linkage_matrix)

plt.show()

from workbench.src import stats_utils

# Reload first so edits to the helper module are picked up without a kernel
# restart, then evaluate candidate clusterings of the encoded team attributes.
importlib.reload(stats_utils)
# presumably returns a table of clustering settings with quality scores —
# see stats_utils.find_optimal_clusters
optimal_clusters = stats_utils.find_optimal_clusters(team_attrs_df_encoded)

/home/paulius/data/projects/football_m2_s4/workbench/src/stats_utils.py:172: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metrics_df["score_davies_bouldin"] = 1 / metrics_df["score_davies_bouldin"]

optimal_clusters

from scipy.cluster.hierarchy import linkage, fcluster

# Settings of the chosen clustering run, stored as strings.
# presumably one exported row of a results table (the "" key looks like a row
# index) — verify against the source of these values.
cluster_settings = {
    "": "38",
    "name": "Hierarchical",
    "component_method": "PCA",
    "n_components": "3.0",
    "method": "ward",
    "cutoff": "34.0",
    "eps": "NaN",
    "min_samples": "NaN",
    "n_clusters": "5",
    "min_count_in_cluster": "123",
    "score_silhouette": "0.253836",
    "score_calinski_harabasz": "479.193982",
    "score_davies_bouldin": "1.088382",
    "score": "0.738100",
}

# NOTE(review): the settings above record a PCA run with 3 components, while
# the linkage below is fit on the raw one-hot encoded attributes. Confirm that
# the stored cutoff is meaningful for this input.
Z = linkage(team_attrs_df_encoded, method=cluster_settings["method"])

# The cutoff is stored as text; convert it so fcluster receives the numeric
# distance threshold it documents instead of relying on implicit coercion.
clusters = fcluster(Z, float(cluster_settings["cutoff"]), criterion="distance")

# Attach the cluster label of every row to a copy of the encoded attributes.
source_df_cluster = team_attrs_df_encoded.copy()
source_df_cluster["cluster"] = clusters

	var	PC	cum_var
0	0.290852	PC1	0.290852
1	0.266693	PC2	0.557544
2	0.126880	PC3	0.684425
3	0.076688	PC4	0.761113
4	0.065649	PC5	0.826762
5	0.058473	PC6	0.885235
6	0.049256	PC7	0.934491
7	0.028955	PC8	0.963446
8	0.013163	PC9	0.976609
9	0.012951	PC10	0.989560

	id	team_fifa_api_id	team_api_id	date	buildUpPlaySpeed	buildUpPlaySpeedClass	buildUpPlayDribbling	buildUpPlayDribblingClass	buildUpPlayPassing	buildUpPlayPassingClass	buildUpPlayPositioningClass	chanceCreationPassing	chanceCreationPassingClass	chanceCreationCrossing	chanceCreationCrossingClass	chanceCreationShooting	chanceCreationShootingClass	chanceCreationPositioningClass	defencePressure	defencePressureClass	defenceAggression	defenceAggressionClass	defenceTeamWidth	defenceTeamWidthClass	defenceDefenderLineClass	days_after_first_date	team_long_name	league_id	id_x	id_y	country_id	league_name
1112	1113	874	1601	2010-02-22	30	Slow	NaN	Little	40	Mixed	Organised	50	Normal	35	Normal	70	Lots	Organised	65	Medium	60	Press	50	Normal	Cover	584	Ruch Chorzów	15722	120	15722	15722	Poland Ekstraklasa
1113	1114	874	1601	2011-02-22	48	Balanced	NaN	Little	51	Mixed	Organised	68	Risky	67	Lots	51	Normal	Organised	46	Medium	48	Press	50	Normal	Cover	949	Ruch Chorzów	15722	120	15722	15722	Poland Ekstraklasa
1114	1115	874	1601	2012-02-22	53	Balanced	NaN	Little	55	Mixed	Organised	44	Normal	65	Normal	50	Normal	Organised	43	Medium	44	Press	49	Normal	Cover	1314	Ruch Chorzów	15722	120	15722	15722	Poland Ekstraklasa
1115	1116	874	1601	2013-09-20	53	Balanced	NaN	Little	55	Mixed	Organised	44	Normal	65	Normal	50	Normal	Organised	43	Medium	44	Press	49	Normal	Cover	1890	Ruch Chorzów	15722	120	15722	15722	Poland Ekstraklasa
1116	1117	874	1601	2014-09-19	53	Balanced	48.0	Normal	38	Mixed	Organised	66	Normal	65	Normal	50	Normal	Organised	43	Medium	44	Press	49	Normal	Cover	2254	Ruch Chorzów	15722	120	15722	15722	Poland Ekstraklasa
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
68	69	112513	158085	2014-09-19	69	Fast	66.0	Normal	39	Mixed	Organised	55	Normal	59	Normal	46	Normal	Organised	35	Medium	37	Press	37	Normal	Cover	2254	FC Arouca	17642	49	17642	17642	Portugal Liga ZON Sagres
69	70	112513	158085	2015-09-10	65	Balanced	66.0	Normal	39	Mixed	Organised	55	Normal	59	Normal	46	Normal	Organised	37	Medium	39	Press	37	Normal	Cover	2610	FC Arouca	17642	49	17642	17642	Portugal Liga ZON Sagres
274	275	112409	208931	2014-09-19	32	Slow	46.0	Normal	31	Short	Organised	47	Normal	36	Normal	54	Normal	Organised	46	Medium	44	Press	51	Normal	Cover	2254	Carpi	10257	19	10257	10257	Italy Serie A
275	276	112409	208931	2015-09-10	80	Fast	45.0	Normal	65	Mixed	Organised	70	Risky	40	Normal	50	Normal	Organised	25	Deep	55	Press	35	Normal	Cover	2610	Carpi	10257	19	10257	10257	Italy Serie A
858	859	111560	274581	2015-09-10	50	Balanced	50.0	Normal	50	Mixed	Organised	50	Normal	50	Normal	50	Normal	Organised	45	Medium	45	Press	50	Normal	Cover	2610	Royal Excel Mouscron	1	30	1	1	Belgium Jupiler League

EDA

Football Leagues

Parsing Goal Events and Player Scoring Data

Season Points Analysis

Team Performance Analysis

Team Rating

Team Rating vs Team Position at the End of Season

Change in Team Rating Over Season

PCA Analysis

Team Attributes PCA

Clustering

	name	component_method	n_components	method	cutoff	eps	min_samples	n_clusters	min_count_in_cluster	score_silhouette	score_calinski_harabasz	score_davies_bouldin	score
13	Hierarchical	None	None	ward	230	None	None	2	198	0.079399	44.468856	3.078634	0.800000
14	Hierarchical	None	None	ward	240	None	None	2	198	0.079399	44.468856	3.078634	0.800000
15	Hierarchical	None	None	ward	250	None	None	2	198	0.079399	44.468856	3.078634	0.800000
7	Hierarchical	None	None	ward	170	None	None	5	31	0.065273	37.352303	2.342327	0.506753
5	Hierarchical	None	None	ward	150	None	None	6	31	0.067596	36.096352	2.411916	0.506391
6	Hierarchical	None	None	ward	160	None	None	6	31	0.067596	36.096352	2.411916	0.506391
10	Hierarchical	None	None	ward	200	None	None	3	108	0.062023	41.589361	2.797015	0.476989
11	Hierarchical	None	None	ward	210	None	None	3	108	0.062023	41.589361	2.797015	0.476989
12	Hierarchical	None	None	ward	220	None	None	3	108	0.062023	41.589361	2.797015	0.476989
0	Hierarchical	None	None	ward	100	None	None	16	4	0.071681	26.141644	1.886573	0.470284
3	Hierarchical	None	None	ward	130	None	None	8	27	0.062360	32.986357	2.166632	0.396250
2	Hierarchical	None	None	ward	120	None	None	10	27	0.063856	30.742088	2.118654	0.382587
8	Hierarchical	None	None	ward	180	None	None	4	92	0.055599	39.727458	2.583317	0.357206
9	Hierarchical	None	None	ward	190	None	None	4	92	0.055599	39.727458	2.583317	0.357206
4	Hierarchical	None	None	ward	140	None	None	7	27	0.058556	34.458802	2.291476	0.339944
1	Hierarchical	None	None	ward	110	None	None	11	23	0.060786	29.607235	2.129367	0.303922