Predizione dei risultati calcistici di Serie A e Serie B

Oggi vi presento un notebook in cui costruisco un modello di classificazione che predice il risultato delle partite di calcio di Serie A e Serie B.

Ecco il link del notebook: prediction_result_of_football_match.ipynb

Librerie necessarie all'utilizzo

!pip install --upgrade pandas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import requests
from datetime import date
from bs4 import BeautifulSoup
from google.colab import drive
import math
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM
import matplotlib.pyplot as plt
from datetime import datetime
from joblib import dump, load 
from pathlib import Path
from sklearn.metrics import mean_squared_error
import time
import datetime
from sklearn.preprocessing import StandardScaler
import sklearn
import urllib
import csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
drive.mount('/content/drive')

import warnings
warnings.simplefilter("ignore")

Data extraction

Riceverò i dati attraverso il seguente url.

# Index page of football-data.co.uk that links to the Italian league CSV files.
URL = "https://www.football-data.co.uk/italym.php"
# A timeout keeps the notebook from hanging forever if the site does not answer.
resp = requests.get(URL, timeout=30)
print(resp.status_code)

# Build the "soup" object. The parser is named explicitly so the parse tree
# does not depend on which optional parsers happen to be installed.
data = BeautifulSoup(resp.content, "html.parser")

def Dataframe_RES(path):
    """Download one CSV from football-data.co.uk and parse it with pandas.

    Args:
        path: Path of the CSV relative to ``https://www.football-data.co.uk/``.

    Returns:
        The parsed DataFrame, or ``None`` when the download or the parsing
        fails. On failure the full URL is appended to the module-level
        ``error`` list so that it can be retried later.
    """
    url = 'https://www.football-data.co.uk/'+path
    try:
        # The context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(url) as res:
            return pd.read_csv(res)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        error.append(url)
        print("An exception occurred")
        return None

# URLs whose first download/parse attempt failed (filled by Dataframe_RES).
error=[]
# One DataFrame per season file, split by league code (I1 / I2).
SERIE_A=[]
SERIE_B=[]
for link in data.find_all('a'):
    href = link.get('href')
    # Anchors without an href yield None; skip them and every non-CSV link.
    if href is None or not str(href).endswith(".csv"):
        continue
    path = str(href)
    # The two characters before ".csv" are the league code.
    league_code = path[-6:-4]
    print(path, league_code)
    if league_code not in ('I1', 'I2'):
        continue
    df = Dataframe_RES(path)
    target = SERIE_A if league_code == 'I1' else SERIE_B
    # A failed download returns None and is retried later from `error`;
    # storing None here would break the concatenation step.
    if df is not None:
        target.append(df)
    print('SERIE_A' if league_code == 'I1' else 'SERIE_B')

# Second attempt for the files that failed above: read them as latin1 and
# skip malformed rows instead of aborting.
print(len(error))
for e in error:
    with urllib.request.urlopen(e) as res:
        # `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'`
        # is its replacement (available since pandas 1.3).
        df = pd.read_csv(res, encoding='latin1', on_bad_lines='skip')
    league_code = e[len(e)-6:len(e)-4]
    if league_code=='I1':
        SERIE_A.append(df)
        print('SERIE_A')
    if league_code=='I2':
        SERIE_B.append(df)
        print('SERIE_B')

Note per i dati di calcio

Tutti i dati sono in formato csv, pronti per l'uso all'interno di applicazioni di fogli di calcolo standard. Si prega di notare che alcune abbreviazioni non sono più in uso (in particolare le quote di specifici bookmaker non più utilizzate) e si riferiscono ai dati raccolti nelle stagioni precedenti. Per un elenco aggiornato di quali bookmaker sono inclusi nel set di dati, visitare http://www.football-data.co.uk/matches.php

Per informazioni sulle chiavi del Dataset: README

# Stack all season files of each league into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat does the same job in
# a single pass. None entries (failed downloads) are ignored.
ds_serieA=pd.concat([df for df in SERIE_A if df is not None], ignore_index=True)
ds_serieB=pd.concat([df for df in SERIE_B if df is not None], ignore_index=True)
ds_serieB

# Both leagues together: the full set of matches used from here on.
matches=pd.concat([ds_serieA, ds_serieB], ignore_index=True)

Data Preparation

# Number of missing values per column.
x=matches.isnull().sum()
# Columns with at most 1000 non-null values are dropped. They are selected by
# label in one step instead of by position inside a loop.
eliminate=list(x.index[x>=len(matches)-1000])
count=len(eliminate)
matches=matches.drop(columns=eliminate)
print('count=',count)
# Full-time result recoded to the Italian 1/X/2 notation. A single replace()
# avoids chained assignment, which does not write through under pandas
# copy-on-write. Values other than A/D/H are left untouched.
matches['risultato']=matches['FTR'].replace({"A":"2","D":"X","H":"1"})
# matches
dataset=matches[["Date","HomeTeam","AwayTeam","B365H","B365D","B365A","BWH","BWD","BWA","IWH","IWD","IWA","PSH","PSD","PSA","WHH","WHD","WHA","VCH","VCD","VCA","risultato"]]
# .copy() makes `dataset` an independent frame, so later column assignments
# do not act on a slice of `matches`.
dataset=dataset.dropna().copy()
dataset

# Work on an independent frame so the column assignments below are not made
# on a slice of `matches`.
dataset=dataset.copy()

# Convert the match date to a Unix timestamp (local time, via time.mktime).
new_date=[]
for raw_date in dataset['Date']:
    date_string = str(raw_date)
    try:
        parsed = datetime.datetime.strptime(date_string, "%d/%m/%Y")
    except ValueError:
        # Some files use a two-digit year. %y maps 69-99 to 19xx and 00-68 to
        # 20xx, whereas blindly inserting "20" would turn 1990s seasons into
        # 209x dates.
        parsed = datetime.datetime.strptime(date_string, "%d/%m/%y")
    new_date.append(time.mktime(parsed.timetuple()))
dataset['Date']=new_date

list_home_team=dataset['HomeTeam'].unique()
list_home_team.sort()
list_away_team=dataset['AwayTeam'].unique()
list_away_team.sort()
# Single True/False: do home and away columns contain the same teams?
# (array_equal also copes with lists of different length.)
print(np.array_equal(list_away_team, list_home_team))
# Integer code per team, built from the union of both columns so that a team
# seen only away still gets a code. When both lists match, the codes are the
# same as those derived from the home list alone.
all_teams=np.union1d(list_home_team, list_away_team)
team_dict=dict(zip(all_teams,np.arange(len(all_teams))))
# map() replaces the names in one pass and avoids chained assignment through
# a mask taken from another frame.
dataset['HomeTeam']=dataset['HomeTeam'].map(team_dict)
dataset['AwayTeam']=dataset['AwayTeam'].map(team_dict)

dataset

# The three possible outcomes: home win, away win, draw.
labels=['1','2','X']
print(dataset['risultato'].value_counts())
# Target column and feature matrix.
Y=dataset['risultato']
X=dataset.drop(columns='risultato')

# First split: 70% training, 30% held out (stratified on the outcome).
x_train, x_holdout, y_train, y_holdout = train_test_split(
    X, Y, test_size=0.30, stratify=Y, random_state=42)
# Second split: the held-out part becomes validation (67%) and test (33%).
x_validation, x_test, y_validation, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.33, stratify=y_holdout, random_state=42)

# Targets are handled as strings everywhere downstream.
y_train, y_validation, y_test = (
    target.astype('str') for target in (y_train, y_validation, y_test))
print(y_train.value_counts())
# The scaler is fitted on the training features only and then applied to the
# validation and test features.
scaler=StandardScaler()
x_train=pd.DataFrame(scaler.fit_transform(x_train))
x_validation, x_test = (
    pd.DataFrame(scaler.transform(part)) for part in (x_validation, x_test))

MODEL TRAINING

# Loading
def load_model(model_path, model):
    """Return the object stored at ``model_path``, read with joblib ``load``.

    Args:
        model_path: Path of the file to read.
        model: Not used; the value passed in is discarded and the loaded
            object is returned instead.

    Returns:
        The object loaded from ``model_path``.

    NOTE(review): joblib files are pickle-based, so only files from a trusted
    location should be loaded.
    """
    return load(model_path)

# Saving
def save_model(model,model_path):
    """Persist ``model`` to ``model_path`` with joblib ``dump``.

    Args:
        model: Object to store.
        model_path: Destination file path.

    Returns:
        The same ``model`` object, so the call can be used in an assignment.
    """
    dump(value=model, filename=model_path)
    return model
    
def verifica_esistenza_modello(file_name):
    """Tell whether ``file_name`` points to an existing regular file.

    The outcome is also printed.

    Args:
        file_name: Path of the model file to look for.

    Returns:
        True when the path is an existing file, False otherwise (including
        when an IOError is raised while checking).
    """
    try:
        esiste = bool(Path(file_name).is_file())
    except IOError:
        print("File not accessible")
        esiste = False
    print("Il modello addestrato esiste?",esiste)
    return esiste

SVM:

from sklearn.svm import SVC
path_name='/content/drive/MyDrive/project/Model/'+"model_prediction_risultato_match"+"_SVM_Scaler_"+"ep.model"
print("Path =",path_name)
exist_model=verifica_esistenza_modello(path_name)
# SVC wrapped in a one-vs-rest strategy; the wrapper is built in both cases.
model_SVC=SVC()
ovr_SVC=OneVsRestClassifier(model_SVC)
if exist_model:
    # A stored model exists: use it instead of training again.
    ovr_SVC=load_model(path_name, ovr_SVC)
else:
    # No stored model yet: train on the training split and store the result.
    ovr_SVC.fit(x_train, y_train)
    ovr_SVC=save_model(ovr_SVC,path_name)
# Evaluation on the validation split.
y_pred_SVC=ovr_SVC.predict(x_validation)
print(accuracy_score(y_validation,y_pred_SVC))
print(classification_report(y_validation,y_pred_SVC))

RandomForestClassifier:

from sklearn.ensemble import RandomForestClassifier
path_name='/content/drive/MyDrive/project/Model/'+"model_prediction_risultato_match"+"_RandomForestClassifier_Scaler_"+"ep.model"
print("Path =",path_name)
exist_model=verifica_esistenza_modello(path_name)
# RandomForest wrapped in a one-vs-rest strategy; built in both cases.
model_RandomForest=RandomForestClassifier()
ovr_RandomForest=OneVsRestClassifier(model_RandomForest)
if exist_model:
    # A stored model exists: use it instead of training again.
    ovr_RandomForest=load_model(path_name, ovr_RandomForest)
else:
    # No stored model yet: train on the training split and store the result.
    ovr_RandomForest.fit(x_train, y_train)
    ovr_RandomForest=save_model(ovr_RandomForest,path_name)
# Evaluation on the validation split.
y_pred_RandomForest=ovr_RandomForest.predict(x_validation)
print(accuracy_score(y_validation,y_pred_RandomForest))
print(classification_report(y_validation,y_pred_RandomForest))

DecisionTreeClassifier:

from sklearn.tree import DecisionTreeClassifier
path_name='/content/drive/MyDrive/project/Model/'+"model_prediction_risultato_match"+"_DecisionTree_Scaler_"+"ep.model"
print("Path =",path_name)
exist_model=verifica_esistenza_modello(path_name)
# DecisionTree wrapped in a one-vs-rest strategy; built in both cases.
model_DecisionTree=DecisionTreeClassifier()
ovr_DecisionTree=OneVsRestClassifier(model_DecisionTree)
if exist_model:
    # A stored model exists: use it instead of training again.
    ovr_DecisionTree=load_model(path_name, ovr_DecisionTree)
else:
    # No stored model yet: train on the training split and store the result.
    ovr_DecisionTree.fit(x_train, y_train)
    ovr_DecisionTree=save_model(ovr_DecisionTree,path_name)
# Evaluation on the validation split.
y_pred_DecisionTree=ovr_DecisionTree.predict(x_validation)
print(accuracy_score(y_validation,y_pred_DecisionTree))
print(classification_report(y_validation,y_pred_DecisionTree))

kNN:

from sklearn.neighbors import KNeighborsClassifier
path_name='/content/drive/MyDrive/project/Model/'+"model_prediction_risultato_match"+"_KNearestNeighbors_Scaler_"+"ep.model"
print("Path =",path_name)
exist_model=verifica_esistenza_modello(path_name)
# kNN wrapped in a one-vs-rest strategy; built in both cases.
model_kNN=KNeighborsClassifier()
ovr_kNN=OneVsRestClassifier(model_kNN)
if exist_model:
    # A stored model exists: use it instead of training again.
    ovr_kNN=load_model(path_name, ovr_kNN)
else:
    # No stored model yet: train on the training split and store the result.
    ovr_kNN.fit(x_train, y_train)
    ovr_kNN=save_model(ovr_kNN,path_name)
# Evaluation on the validation split.
y_pred_kNN=ovr_kNN.predict(x_validation)
print(accuracy_score(y_validation,y_pred_kNN))
print(classification_report(y_validation,y_pred_kNN))

AdaBoost Classifier:

from sklearn.ensemble import AdaBoostClassifier
path_name='/content/drive/MyDrive/project/Model/'+"model_prediction_risultato_match"+"_AdaBoostClassifier_Scaler_"+"ep.model"
print("Path =",path_name)
exist_model=verifica_esistenza_modello(path_name)
# AdaBoost wrapped in a one-vs-rest strategy; built in both cases.
model_AdaBoost=AdaBoostClassifier()
ovr_AdaBoost=OneVsRestClassifier(model_AdaBoost)
if exist_model:
    # A stored model exists: use it instead of training again.
    ovr_AdaBoost=load_model(path_name, ovr_AdaBoost)
else:
    # No stored model yet: train on the training split and store the result.
    ovr_AdaBoost.fit(x_train, y_train)
    ovr_AdaBoost=save_model(ovr_AdaBoost,path_name)
# Evaluation on the validation split.
y_pred_AdaBoost=ovr_AdaBoost.predict(x_validation)
print(accuracy_score(y_validation,y_pred_AdaBoost))
print(classification_report(y_validation,y_pred_AdaBoost))

Neural Networks:

from tensorflow import keras
# Build the network model
def Neural_network_model():
    """Return an uncompiled feed-forward classifier.

    Layers, in order: Dense(32, relu) on 21 input features, Dense(16, relu),
    Dropout(0.3), Dense(8, relu) and a 3-unit softmax output.
    """
    layers = keras.layers
    stack = [
        layers.Dense(32, input_dim=21, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(8, activation='relu'),
        layers.Dense(3, activation='softmax'),
    ]
    return keras.Sequential(stack)

model=Neural_network_model()

# Compile the model
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
# One-hot targets for the softmax output (one column per outcome).
y_train_enc=pd.get_dummies(y_train)
y_validation_enc=pd.get_dummies(y_validation)
y_test_enc=pd.get_dummies(y_test)

epochs=500
path_name='/content/drive/MyDrive/project/Model/'+"model_prediction_risultato_match"+"_NNClassifier_Scaler_"+str(epochs)+"_ep.model"
print("Path =",path_name)
exist_model=verifica_esistenza_modello(path_name)

# NeuralNet
if not exist_model:
    # Train, holding out 20% of the training data for the validation curves,
    # then store the returned History object.
    history=model.fit(x_train,y_train_enc, epochs=epochs,validation_split=0.2)
    history=save_model(history,path_name)
else:
    history=load_model(path_name, model)
    # The stored file holds the training History, not the network. Without
    # this step `model` would stay freshly initialised and every later
    # predict() would run on untrained weights.
    # NOTE(review): relies on the stored History carrying its trained network
    # in `.model`; confirm with the installed Keras version.
    restored_model=getattr(history, "model", None)
    if restored_model is not None:
        model=restored_model

# One figure per metric: the training curve against its validation curve.
curves = (
    ('accuracy', 'val_accuracy', 'Accuracy vs Val_accuracy'),
    ('loss', 'val_loss', 'Loss vs Val_loss'),
)
for train_key, val_key, figure_title in curves:
    plt.plot(history.history[train_key],label=train_key)
    plt.plot(history.history[val_key],label=val_key)
    plt.legend(loc="upper left")
    plt.title(figure_title)
    plt.show()

# Class probabilities from the network, one column per output unit.
y_pred_NN=model.predict(x_validation)
n_classes_NN=y_pred_NN.shape[1]
# Index of the most probable class for every match.
y_pred_NN=np.argmax(y_pred_NN,axis=1)
# How many distinct classes the network actually predicted.
print(len(np.unique(y_pred_NN)))
# One-hot encode with a fixed width: column i is always class index i, even
# when some class is never predicted. Padding the get_dummies() output
# afterwards put the missing columns in the wrong place, or overwrote a real
# one, whenever the absent class was not the last.
y_pred_NN_enc=pd.DataFrame(np.eye(n_classes_NN,dtype=int)[y_pred_NN])
print(accuracy_score(y_validation_enc,y_pred_NN_enc))
print(classification_report(y_validation_enc,y_pred_NN_enc))

Confronto su un nuovo dataset di Test per determinare il classificatore migliore:

def _evaluate_on_test(name, y_true, y_pred):
    """Print accuracy and classification report for one model.

    Args:
        name: Label printed in the header.
        y_true: True targets of the test split.
        y_pred: Predictions of the model on the test split.

    Returns:
        The weighted-average F1 score of the predictions.
    """
    accuracy=accuracy_score(y_true,y_pred)
    report_text=classification_report(y_true,y_pred)
    # Taken from the structured report rather than sliced out of the printed
    # text: the text is rounded (so models can tie artificially) and its
    # layout is not a stable interface.
    weighted_avg=classification_report(y_true,y_pred,output_dict=True)['weighted avg']['f1-score']
    print(("_"*55)+"\n"+name+"\naccuracy:",accuracy,"\nweighted avg:",weighted_avg)
    print(report_text)
    return weighted_avg


# Weighted F1 on the test split, per model name.
ris_model=dict()

y_pred_SVC_test=ovr_SVC.predict(x_test)
ris_model['SVC']=_evaluate_on_test('SVC',y_test,y_pred_SVC_test)

y_pred_RandomForest_test=ovr_RandomForest.predict(x_test)
ris_model['RandomForest']=_evaluate_on_test('RandomForest',y_test,y_pred_RandomForest_test)

y_pred_DecisionTree_test=ovr_DecisionTree.predict(x_test)
ris_model['DecisionTree']=_evaluate_on_test('DecisionTree',y_test,y_pred_DecisionTree_test)

y_pred_kNN_test=ovr_kNN.predict(x_test)
ris_model['kNN']=_evaluate_on_test('kNN',y_test,y_pred_kNN_test)

y_pred_AdaBoost_test=ovr_AdaBoost.predict(x_test)
ris_model['AdaBoost']=_evaluate_on_test('AdaBoost',y_test,y_pred_AdaBoost_test)

# Neural network: probabilities -> class index -> fixed-width one-hot matrix.
# Column i is always class index i, so a class that is never predicted cannot
# shift or overwrite the other columns.
y_pred_NN_test=model.predict(x_test)
n_classes_NN_test=y_pred_NN_test.shape[1]
y_pred_NN_test=np.argmax(y_pred_NN_test,axis=1)
y_pred_NN_test_enc=pd.DataFrame(np.eye(n_classes_NN_test,dtype=int)[y_pred_NN_test])
ris_model['NeuralNetwork']=_evaluate_on_test('NeuralNetwork',y_test_enc,y_pred_NN_test_enc)

# Best score and the first model that reaches it.
max_acc_model=max(ris_model.values())
print("Il miglior modello è :",max(ris_model,key=ris_model.get))

OUTPUT:
Il miglior modello è : RandomForest

Ottimizzazione dei modelli di classificazione

Se confrontiamo i report di classificazione di tutti i classificatori che abbiamo provato, possiamo vedere che il classificatore RandomForest ha ottenuto i risultati migliori. Possiamo ora eseguire l'ottimizzazione degli iperparametri di ciascun classificatore per migliorarne le prestazioni e trovare il miglior set di valori. Userò GridSearchCV per ottimizzare, ad esempio, i parametri learning_rate e n_estimators di AdaBoost. Come metrica di punteggio ho scelto "f1_weighted", che pesa il punteggio F1 di ogni classe in base alla distribuzione delle classi.

RandomForest OPT:

from sklearn.model_selection import GridSearchCV
path_name='/content/drive/MyDrive/project/Model/'+"model_prediction_risultato_match"+"_RandomForestClassifier_Scaler_OPT_"+"ep.model"
print("Path =",path_name)
exist_model=verifica_esistenza_modello(path_name)


# RandomForest
if not exist_model:
    # The "estimator__" prefix addresses the RandomForest wrapped by
    # OneVsRestClassifier.
    params={
        'estimator__bootstrap': [True, False],
        'estimator__max_depth': [10, 20, None],
        # 'auto' was removed in scikit-learn 1.3 (for classifiers it was an
        # alias of 'sqrt'); 'log2' keeps two distinct options in the grid.
        'estimator__max_features': ['sqrt', 'log2'],
        'estimator__min_samples_leaf': [1, 2],
        'estimator__min_samples_split': [2, 4],
        'estimator__n_estimators': [10, 20]
    }
    grid_clf=GridSearchCV(estimator=OneVsRestClassifier(RandomForestClassifier()), scoring='f1_weighted',param_grid=params,cv=5,n_jobs=-1,verbose=3)
    grid_clf.fit(x_train,y_train)
    print("the best parameters are: ",grid_clf.best_estimator_)
    # With the default refit=True, best_estimator_ has already been retrained
    # on the whole training split, so it is stored as is.
    ovr_RandomForest=save_model(grid_clf.best_estimator_,path_name)
else:
    # load_model ignores its second argument; the stored model is returned.
    ovr_RandomForest=load_model(path_name, None)

# Evaluation on the validation split.
y_pred_RandomForest=ovr_RandomForest.predict(x_validation)
print(accuracy_score(y_validation,y_pred_RandomForest))
print(classification_report(y_validation,y_pred_RandomForest))

SVC OPT:

path_name='/content/drive/MyDrive/project/Model/'+"model_prediction_risultato_match"+"_SVM_Scaler_OPT_"+"ep.model"
print("Path =",path_name)
exist_model=verifica_esistenza_modello(path_name)
# SVC
if not exist_model:
    # The "estimator__" prefix addresses the SVC wrapped by
    # OneVsRestClassifier.
    params={
        # C must be strictly positive: the previous grid started at 0, so a
        # third of the candidates could never be fitted.
        "estimator__C": [1, 2, 4],
        "estimator__kernel": ["linear","poly","rbf"],
        "estimator__degree":np.arange(1,5)
    }
    grid_clf=GridSearchCV(estimator=OneVsRestClassifier(SVC()), scoring='f1_weighted',param_grid=params,cv=5,n_jobs=-1,verbose=3)
    grid_clf.fit(x_train,y_train)
    print("the best parameters are: ",grid_clf.best_estimator_)
    # With the default refit=True, best_estimator_ has already been retrained
    # on the whole training split, so it is stored as is.
    ovr_SVC=save_model(grid_clf.best_estimator_,path_name)
else:
    # load_model ignores its second argument; the stored model is returned.
    ovr_SVC=load_model(path_name, None)

# Evaluation on the validation split.
y_pred_SVC=ovr_SVC.predict(x_validation)
print(accuracy_score(y_validation,y_pred_SVC))
print(classification_report(y_validation,y_pred_SVC))

ADABOOST OPT:

path_name='/content/drive/MyDrive/project/Model/'+"model_prediction_risultato_match"+"_AdaBoostClassifier_Scaler_OPT_"+"ep.model"
print("Path =",path_name)
exist_model=verifica_esistenza_modello(path_name)
# AdaBoostClassifier
if not exist_model:
    # The "estimator__" prefix addresses the AdaBoost model wrapped by
    # OneVsRestClassifier.
    params={
        "estimator__n_estimators": np.arange(10,100,10),
        "estimator__learning_rate": [0.05, 0.1, 1]
    }
    grid_clf=GridSearchCV(estimator=OneVsRestClassifier(AdaBoostClassifier()), scoring='f1_weighted',param_grid=params,cv=5,n_jobs=-1,verbose=3)
    grid_clf.fit(x_train,y_train)
    print("the best parameters are: ",grid_clf.best_estimator_)
    # With the default refit=True, best_estimator_ has already been retrained
    # on the whole training split, so it is stored as is.
    ovr_AdaBoost=save_model(grid_clf.best_estimator_,path_name)
else:
    # load_model ignores its second argument; the stored model is returned.
    ovr_AdaBoost=load_model(path_name, None)

# Evaluation on the validation split.
y_pred_AdaBoost=ovr_AdaBoost.predict(x_validation)
print(accuracy_score(y_validation,y_pred_AdaBoost))
print(classification_report(y_validation,y_pred_AdaBoost))

DecisionTree OPT:

path_name='/content/drive/MyDrive/project/Model/'+"model_prediction_risultato_match"+"_DecisionTree_Scaler_OPT_"+"ep.model"
print("Path =",path_name)
exist_model=verifica_esistenza_modello(path_name)
# DecisionTree
if not exist_model:
    # The "estimator__" prefix addresses the DecisionTree wrapped by
    # OneVsRestClassifier.
    params={
        "estimator__criterion": ['gini', 'entropy'],
        "estimator__max_depth": [2,4,6,8,10,12]
    }
    grid_clf=GridSearchCV(estimator=OneVsRestClassifier(DecisionTreeClassifier()), scoring='f1_weighted',param_grid=params,cv=5,n_jobs=-1,verbose=3)
    grid_clf.fit(x_train,y_train)
    print("the best parameters are: ",grid_clf.best_estimator_)
    # With the default refit=True, best_estimator_ has already been retrained
    # on the whole training split, so it is stored as is.
    ovr_DecisionTree=save_model(grid_clf.best_estimator_,path_name)
else:
    # load_model ignores its second argument; the stored model is returned.
    ovr_DecisionTree=load_model(path_name, None)

# Evaluation on the validation split.
y_pred_DecisionTree=ovr_DecisionTree.predict(x_validation)
print(accuracy_score(y_validation,y_pred_DecisionTree))
print(classification_report(y_validation,y_pred_DecisionTree))

kNN OPT:

path_name='/content/drive/MyDrive/project/Model/'+"model_prediction_risultato_match"+"_kNN_Scaler_OPT_"+"ep.model"
print("Path =",path_name)
exist_model=verifica_esistenza_modello(path_name)
# kNN
if not exist_model:
    # The "estimator__" prefix addresses the kNN model wrapped by
    # OneVsRestClassifier.
    params={
        "estimator__leaf_size": np.arange(2,26,2),
        "estimator__n_neighbors": np.arange(2,26,2),
        "estimator__p":[1,2]
    }
    grid_clf=GridSearchCV(estimator=OneVsRestClassifier(KNeighborsClassifier()), scoring='f1_weighted',param_grid=params,cv=5,n_jobs=-1,verbose=3)
    grid_clf.fit(x_train,y_train)
    print("the best parameters are: ",grid_clf.best_estimator_)
    # With the default refit=True, best_estimator_ has already been retrained
    # on the whole training split, so it is stored as is.
    ovr_kNN=save_model(grid_clf.best_estimator_,path_name)
else:
    # load_model ignores its second argument; the stored model is returned.
    ovr_kNN=load_model(path_name, None)

# Evaluation on the validation split.
y_pred_kNN=ovr_kNN.predict(x_validation)
print(accuracy_score(y_validation,y_pred_kNN))
print(classification_report(y_validation,y_pred_kNN))

Possiamo vedere che le prestazioni del modello sono migliorate con i parametri GridSearch e il RandomForest continua a superare gli altri modelli.

Questo articolo ha discusso le sfide della classificazione multiclasse e ha dimostrato come implementare vari algoritmi per sviluppare modelli di classificazione multiclasse migliori.

# Model used for the predictions on the upcoming fixtures: whatever
# one-vs-rest RandomForest is currently bound to ovr_RandomForest.
final_model=ovr_RandomForest

Predizione dei risultati della prossima giornata di campionato

Utilizzo dei seguenti link:

Conversione da immagine in testo: image-to-text-converter
Screenshot alle seguenti quote:
- bet365: bet365
- Bet&Win: sports.bwin
- Interwetten: interwetten
- Pinnacle: pinnacle
- William Hill: sports.williamhill
- VC Bet: betcalcio

LEGGENDA dei dati utilizzati:

"Date": Data della partita (gg/mm/aa)
"HomeTeam": Squadra di casa
"AwayTeam": Squadra ospite
"B365H": Quote Bet365 per la vittoria in casa
"B365D": Quote Bet365 per il pareggio
"B365A": Quote Bet365 per la vittoria in trasferta
"BWH": Quote Bet&Win per la vittoria in casa
"BWD": Quote Bet&Win per il pareggio
"BWA": Quote Bet&Win per la vittoria in trasferta
"IWH": Quote Interwetten per la vittoria in casa
"IWD": Quote Interwetten per il pareggio
"IWA": Quote Interwetten per la vittoria in trasferta
"PSH": Quote Pinnacle per la vittoria in casa
"PSD": Quote Pinnacle per il pareggio
"PSA": Quote Pinnacle per la vittoria in trasferta
"WHH": Quote William Hill per la vittoria in casa
"WHD": Quote William Hill per il pareggio
"WHA": Quote William Hill per la vittoria in trasferta
"VCH": Quote VC Bet per la vittoria in casa
"VCD": Quote VC Bet per il pareggio
"VCA": Quote VC Bet per la vittoria in trasferta

Previsione sulle prossime partite in base ai dati attuali

Le quote attuali delle agenzie di scommesse vanno inserite manualmente, perché i siti di scommesse non ne consentono l'estrazione automatica.

# Manually entered odds, one table per bookmaker. Each row is one fixture, in
# the same order as the fixtures listed in ds_pred, and holds
# [home win, draw, away win].

# bet365 odds.
Bet365=[[2.30,3.20,3.10],
        [3.20,3.20,2.25],
        [1.60,4.00,5.25],
        [1.30,5.50,8.50],
        [2.10,3.50,3.30],
        [2.45,3.20,2.90],
        [1.30,5.25,9.50],
        [5.25,4.00,1.60],
        [5.00,3.75,1.70],
        [3.60,3.25,2.05]]

# Bet&Win odds.
Bet_and_Win=[[2.30,3.30,3.10],
        [3.20,3.30,2.30],
        [1.60,4.20,5.00],
        [1.32,5.50,8.75],
        [2.15,3.60,3.20],
        [2.40,3.30,2.90],
        [1.30,5.75,9.75],
        [5.00,4.10,1.62],
        [5.00,3.90,1.68],
        [3.60,3.40,2.05]]

# Interwetten odds.
Interwetten=[[2.35,3.30,3.05],
        [3.20,3.25,2.30],
        [1.63,4.20,5.25],
        [1.33,5.75,8.25],
        [2.15,3.55,3.25],
        [2.50,3.75,2.90],
        [1.30,5.75,9.50],
        [5.50,4.10,1.60],
        [4.90,3.80,1.70],
        [3.65,3.30,2.10]]

# Pinnacle odds.
Pinnacle=[[2.480,3.330,3.140],
        [3.310,3.320,2.390],
        [1.675,4.260,5.170],
        [1.317,5.780,10.010],
        [2.600,3.120,3.100],
        [2.210,3.580,3.380],
        [1.298,5.760,11.400],
        [6.070,4.120,1.602],
        [5.270,3.720,1.746],
        [3.890,3.300,2.140]]

# William Hill odds.
William_Hill=[[2.30,3.30,3.10],
        [3.20,3.20,2.30],
        [1.60,4.00,5.50],
        [1.32,5.25,9.50],
        [2.10,3.60,3.30],
        [2.45,3.20,3.00],
        [1.28,5.25,11.00],
        [5.50,3.90,1.60],
        [5.00,3.70,1.70],
        [3.75,3.25,2.05]]    

# VC Bet odds.
VC_Bet=[[2.49,3.32,3.19],
        [3.36,3.36,2.38],
        [1.70,4.34,5.50],
        [1.34,6.05,9.50],
        [2.18,3.78,3.40],
        [2.63,3.25,3.15],
        [1.00,5.90,10.00],  # NOTE(review): home odd 1.00 looks like a transcription error (other bookmakers list about 1.30 for this fixture); verify
        [5.75,4.32,1.65],
        [5.30,4.00,1.77],
        [4.06,3.32,2.16]]

# Odds tables in the same order as the bookmaker column prefixes used below.
quote=[Bet365,Bet_and_Win,Interwetten,Pinnacle,William_Hill,VC_Bet]
# Upcoming fixtures ("Home - Away") with kick-off time and date.
ds_pred={
    'Empoli - Monza':{'ora':'15:00','data':'15/10/2022'},
    'Torino - Juventus':{'ora':'18:00','data':'15/10/2022'},
    'Atalanta - Sassuolo':{'ora':'20:45' ,'data':'15/10/2022'},
    'Inter - Salernitana':{'ora':'11:30','data':'16/10/2022'},
    'Lazio - Udinese':{'ora':'15:00','data':'16/10/2022'},
    'Spezia - Cremonese':{'ora':'15:00','data':'16/10/2022'},
    'Napoli - Bologna':{'ora':'18:00','data':'16/10/2022'},
    'Verona - Milan':{'ora':'20:45','data':'16/10/2022'},
    'Sampdoria - Roma':{'ora':'18:30','data':'17/10/2022'},
    'Lecce - Fiorentina':{'ora':'20:45','data':'17/10/2022'},
}
# Column-oriented container: one list per feature, filled fixture by fixture.
new_ds={"Date":[],"HomeTeam":[],"AwayTeam":[],"B365H":[],"B365D":[],"B365A":[],"BWH":[],"BWD":[],"BWA":[],"IWH":[],"IWD":[],"IWA":[],"PSH":[],"PSD":[],"PSA":[],"WHH":[],"WHD":[],"WHA":[],"VCH":[],"VCD":[],"VCA":[]}
lista_matches=list(ds_pred.keys())
n_match=len(Bet365)
# Column names are <bookmaker prefix> + <outcome suffix>; the suffix position
# (0, 1, 2) is also the position of the odd inside each row.
bookmaker_prefixes=("B365","BW","IW","PS","WH","VC")
outcome_suffixes=("H","D","A")
for idx in range(n_match):
    fixture=lista_matches[idx]
    # Match date as a Unix timestamp, like the Date feature of the training set.
    kickoff_day=datetime.datetime.strptime(ds_pred[fixture]['data'], "%d/%m/%Y")
    new_ds["Date"].append(time.mktime(kickoff_day.timetuple()))

    sides=fixture.split(' - ')
    new_ds['HomeTeam'].append(sides[0])
    new_ds['AwayTeam'].append(sides[1])

    for prefix, odds_table in zip(bookmaker_prefixes, quote):
        for position, suffix in enumerate(outcome_suffixes):
            new_ds[prefix+suffix].append(odds_table[idx][position])

new_pred_ds=pd.DataFrame(new_ds)
# Readable team names kept aside for the final table. .copy() makes it an
# independent frame, so adding a column later is not done on a slice.
new_pred_ds_valid=new_pred_ds[['HomeTeam','AwayTeam']].copy()
# Every team must have a code in team_dict; otherwise its name would reach
# the scaler as a string and fail there with an unrelated-looking error.
unknown_teams=sorted((set(new_pred_ds['HomeTeam'])|set(new_pred_ds['AwayTeam']))-set(team_dict))
if unknown_teams:
    raise ValueError("Teams missing from team_dict: "+", ".join(unknown_teams))
# Replace names by their integer codes in one pass (no chained assignment).
new_pred_ds['HomeTeam']=new_pred_ds['HomeTeam'].map(team_dict)
new_pred_ds['AwayTeam']=new_pred_ds['AwayTeam'].map(team_dict)
new_pred_ds

# Scale with the scaler fitted on the training data, then predict.
new_pred_ds_scaler=pd.DataFrame(scaler.transform(new_pred_ds))
new_pred_ds_scaler_pred=final_model.predict(new_pred_ds_scaler)
print(new_pred_ds_scaler_pred)
# Predicted outcome next to the fixture it belongs to.
new_pred_ds_valid['Quota']=new_pred_ds_scaler_pred
new_pred_ds_valid

Predizione dei risultati calcistici di Serie A e Serie B

DecisionTree OPT:

Post recenti

Comments