from pathlib import Path
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer


# Directory containing the training CSV read by the preprocessing functions.
# The path is relative, so it resolves against the current working directory.
TRAIN_DATA_DIR = Path('data/train')


### Create Lagged Features

def player_lag_features(df, features, for_pred=False, gw=0):
    """Add per-player rolling lag totals and per-90-minute rates to a copy of ``df``.

    For each ``{column: [lag, ...]}`` entry a ``<column>_last_<lag>`` total is
    computed with a rolling window of ``lag + 1`` rows inside each ``name``
    group. For every column other than ``minutes`` a
    ``<column>_pg_last_<lag>`` rate is derived as
    ``90 * total / minutes_last_<lag>``.

    Args:
        df: DataFrame with ``name`` and ``minutes`` columns plus one column per
            requested feature. The rolling window follows row order, so rows
            are presumably sorted chronologically per player -- confirm with
            the caller.
        features: List of single-entry dicts mapping a column name to the list
            of lags to compute. Minutes totals are only built for lags 1-5, so
            any other lag raises ``KeyError`` when the rate is computed. The
            list is not modified.
        for_pred: If True the rolling total includes the current row;
            otherwise the current row's value is subtracted, leaving only the
            preceding rows in the total.
        gw: Totals whose lag is not strictly below ``gw`` are replaced by 0.
            NOTE(review): with the default ``gw=0`` no lag satisfies
            ``lag < gw``, so every total (and therefore every rate) is zeroed
            or NaN -- confirm this is intended for training.

    Returns:
        Tuple ``(df_new, player_lag_vars)``: the augmented copy of ``df`` and
        the list of generated column names intended as model inputs (all
        per-90 rates, plus ``minutes_last_2`` and ``minutes_last_4``).
    """
    df_new = df.copy()
    player_lag_vars = []

    # Minutes totals are the denominators of the per-90 rates, so they must be
    # computed first. Build a new list instead of inserting into `features`:
    # mutating the caller's list made repeated calls with the same list
    # prepend the minutes spec again and duplicate the output column names.
    all_features = [{'minutes': [1, 2, 3, 4, 5]}, *features]

    for item in all_features:
        feature, lags = tuple(item.items())[0]
        for lag in lags:
            feature_name = feature + '_last_' + str(lag)
            minute_name = 'minutes_last_' + str(lag)
            window = lag + 1
            grouped = df_new.groupby(['name'])[feature]

            if for_pred:
                totals = grouped.transform(
                    lambda x: x.rolling(min_periods=1, window=window).sum())
            else:
                # Subtract the current row so the total covers earlier rows only.
                totals = grouped.transform(
                    lambda x: x.rolling(min_periods=1, window=window).sum() - x)

            # Lags that reach back to or beyond `gw` are blanked out.
            df_new[feature_name] = totals if lag < gw else 0

            if feature == 'minutes':
                # Only the 2- and 4-row minutes totals are exposed as features.
                if lag not in (1, 3, 5):
                    player_lag_vars.append(minute_name)
                continue

            pg_feature_name = feature + '_pg_last_' + str(lag)
            player_lag_vars.append(pg_feature_name)

            per_90 = 90 * df_new[feature_name] / df_new[minute_name]
            # A non-zero total over zero minutes yields +/-inf; treat it as
            # missing, like the 0/0 case which is already NaN.
            df_new[pg_feature_name] = per_90.replace([np.inf, -np.inf], np.nan)

    return df_new, player_lag_vars

# functions to get validation set indexes
# training will always be from start of data up to valid-start
# first function to get the validation set points for a given season and gameweek
def validation_gw_idx(df, season, gw, length):
    """Return the first and last index labels of a validation window.

    The window starts at gameweek ``gw`` of ``season`` and spans ``length``
    gameweeks, with the final gameweek capped at 38.

    Returns:
        Tuple ``(valid_start, valid_end)`` of ints: the smallest index label
        among rows of the first gameweek and the largest index label among
        rows of the last gameweek.
    """
    final_gw = min(gw + length - 1, 38)
    season_mask = df['season'] == season

    opening_rows = df[(df['gw'] == gw) & season_mask]
    closing_rows = df[(df['gw'] == final_gw) & season_mask]

    first_label = opening_rows.index.min()
    last_label = closing_rows.index.max()
    return (int(first_label), int(last_label))


#   create dataset with adjusted post-validation lag numbers
def create_lag_train(df, cat_vars, cont_vars, player_lag_vars, dep_var, valid_season, valid_gw, valid_len):
    """Build a combined train/validation frame with frozen validation lag values.

    Training rows keep their own lag columns. Validation rows instead receive,
    for each player, the lag values from that player's earliest-kickoff row in
    ``valid_season`` at or after ``valid_gw``, so the same values are used for
    every validation gameweek.

    Returns:
        Tuple ``(lag_train_df, train_idx, valid_idx)``: the concatenated frame
        with a fresh 0..n-1 index, and the ``range`` objects used to slice the
        training and validation rows positionally out of ``df``.
    """
    # Rows of the validation season from the first validation gameweek onward.
    from_valid_gw = df[(df['season'] == valid_season) & (df['gw'] >= valid_gw)]
    lag_snapshot = from_valid_gw[['name', 'kickoff_time'] + player_lag_vars]

    # Keep only each player's earliest fixture in that span.
    earliest_kickoff = lag_snapshot.groupby('name')['kickoff_time'].transform('min')
    lag_snapshot = lag_snapshot[lag_snapshot['kickoff_time'] == earliest_kickoff]
    lag_snapshot = lag_snapshot.drop(columns='kickoff_time')

    # Training is everything before the validation window starts.
    valid_start, valid_end = validation_gw_idx(df, valid_season, valid_gw, valid_len)
    train_idx = range(valid_start)
    valid_idx = range(valid_start, valid_end + 1)

    # 'name' and 'id' are always carried along; the set removes duplicates.
    cat_vars = list(set(['name' ,'id'] + cat_vars))
    base_cols = cat_vars + cont_vars

    train = df[base_cols + player_lag_vars + dep_var].iloc[train_idx]
    # Validation rows are selected without lag columns; the frozen per-player
    # values are attached by name instead.
    valid = df[base_cols + dep_var].iloc[valid_idx]
    valid = valid.merge(lag_snapshot, on='name', how='left')

    # Stack training and validation rows back into one frame.
    combined = pd.concat([train, valid], sort=True)
    lag_train_df = combined.reset_index(drop=True)
    return lag_train_df, train_idx, valid_idx

def preprocess_data(validation_season: str = '2526', validation_gw: int = 1, validation_len: int = 1):
    """Load the training CSV and return encoded train/validation splits.

    Reads ``train_v1.csv`` from ``TRAIN_DATA_DIR``, adds player lag features,
    carves out the validation window, and encodes the features with a
    ``DictVectorizer``. The ``gw``, ``season`` and ``position_AM`` columns are
    dropped from the encoded matrix when present.

    NOTE(review): ``player_lag_features`` is called with its default ``gw=0``;
    check how that interacts with its ``lag < gw`` gating.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of DataFrames.
    """
    raw_df = pd.read_csv(f'{TRAIN_DATA_DIR}/train_v1.csv',
                         dtype={'season': str})

    # Columns and lags for which player lag features are generated.
    lag_specs = [
        {'goals_scored': [2, 3, 5]},
        {'assists': [2, 3, 5]},
        {'goals_conceded': [1, 2, 3, 5]},
        {'clean_sheets': [2, 3, 5]},
        {'yellow_cards': [4]},
    ]
    lag_train_df, player_lag_vars = player_lag_features(raw_df, lag_specs)

    cat_vars = ['position', 'season']
    cont_vars = ['gw', 'XA', 'XG', 'XGC', 'strength_h', 'strength_a', 'team_gw_diff']
    dep_var = ['total_points']

    # Gameweek and season become ordered categoricals so they can be compared
    # with >= when selecting the validation window.
    gw_dtype = CategoricalDtype(categories=list(range(1, 39)), ordered=True)
    season_dtype = CategoricalDtype(categories=['2223', '2324', '2425', '2526'], ordered=True)
    lag_train_df['gw'] = lag_train_df['gw'].astype(gw_dtype)
    lag_train_df['season'] = lag_train_df['season'].astype(season_dtype)

    train_valid_df, train_idx, valid_idx = create_lag_train(
        lag_train_df,
        cat_vars, cont_vars,
        player_lag_vars, dep_var,
        validation_season, validation_gw, validation_len,
    )

    # Lag features contain nulls by construction; treat them as zero.
    train_valid_df[player_lag_vars] = train_valid_df[player_lag_vars].fillna(0)

    # Separate the model inputs from the dependent variable.
    feature_cols = cat_vars + cont_vars + player_lag_vars
    X = train_valid_df[feature_cols].copy()
    y = train_valid_df[dep_var].copy()

    # Position must be a string so the vectorizer one-hot encodes it; season
    # is turned into integer labels.
    X['position'] = X['position'].apply(str)
    season_encoder = LabelEncoder()
    X['season'] = season_encoder.fit_transform(X['season'])

    # Convert the records to a purely numeric matrix.
    dv = DictVectorizer(sparse=False, separator='_')
    X_encoded = dv.fit_transform(X.to_dict("records"))
    X_df = pd.DataFrame(X_encoded, columns=dv.feature_names_)

    X_df = X_df.drop(columns=["gw", 'season', 'position_AM'], errors="ignore")

    # Split back into training and validation sets.
    X_train, y_train = X_df.loc[train_idx], y.loc[train_idx]
    X_test, y_test = X_df.loc[valid_idx], y.loc[valid_idx]
    return X_train, y_train, X_test, y_test


def preprocess_data_new(validation_season: str = '2526', validation_gw: int = 1, validation_len: int = 1):
    """Load the training CSV and return encoded splits without FWD/GK/MID indicators.

    Runs the same pipeline as ``preprocess_data`` but drops ``gw``, ``season``,
    ``position_FWD``, ``position_GK`` and ``position_MID`` from the encoded
    matrix. The drop is strict: a missing column raises ``KeyError``.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of DataFrames.
    """
    source_df = pd.read_csv(f'{TRAIN_DATA_DIR}/train_v1.csv',
                            dtype={'season': str})

    # Columns and lags for which player lag features are generated.
    lag_requests = [
        {'goals_scored': [2, 3, 5]},
        {'assists': [2, 3, 5]},
        {'goals_conceded': [1, 2, 3, 5]},
        {'clean_sheets': [2, 3, 5]},
        {'yellow_cards': [4]},
    ]
    lag_train_df, player_lag_vars = player_lag_features(source_df, lag_requests)

    cat_vars = ['position', 'season']
    cont_vars = ['gw', 'XA', 'XG', 'XGC', 'strength_h', 'strength_a', 'team_gw_diff']
    dep_var = ['total_points']

    # Ordered categoricals allow >= comparisons on gameweek and season.
    ordered_columns = {
        'gw': CategoricalDtype(categories=list(range(1, 39)), ordered=True),
        'season': CategoricalDtype(categories=['2223', '2324', '2425', '2526'], ordered=True),
    }
    for column, dtype in ordered_columns.items():
        lag_train_df[column] = lag_train_df[column].astype(dtype)

    train_valid_df, train_idx, valid_idx = create_lag_train(
        lag_train_df,
        cat_vars, cont_vars,
        player_lag_vars, dep_var,
        validation_season, validation_gw, validation_len,
    )

    # Lag features contain nulls by construction; treat them as zero.
    train_valid_df[player_lag_vars] = train_valid_df[player_lag_vars].fillna(0)

    # Separate the model inputs from the dependent variable.
    X = train_valid_df[cat_vars + cont_vars + player_lag_vars].copy()
    y = train_valid_df[dep_var].copy()

    # Position is one-hot encoded via its string form; season becomes integer
    # labels.
    X['position'] = X['position'].apply(str)
    X['season'] = LabelEncoder().fit_transform(X['season'])

    # Convert the records to a purely numeric matrix.
    records = X.to_dict("records")
    dv = DictVectorizer(sparse=False, separator='_')
    X_df = pd.DataFrame(dv.fit_transform(records), columns=dv.feature_names_)

    unwanted = ["gw", 'season', 'position_FWD', 'position_GK', 'position_MID']
    X_df = X_df.drop(columns=unwanted)

    # Split back into training and validation sets.
    X_train, y_train = X_df.loc[train_idx], y.loc[train_idx]
    X_test, y_test = X_df.loc[valid_idx], y.loc[valid_idx]
    return X_train, y_train, X_test, y_test

if __name__ == "__main__":
    # Run the default preprocessing pipeline when executed as a script. The
    # returned train/validation splits are discarded; this only confirms the
    # pipeline runs end to end.
    preprocess_data()
    print("Data preprocessing complete.")