Oggi vi presento un notebook che realizza un modello di classificazione per predire il risultato delle partite di calcio di Serie A e Serie B.
Ecco il link del notebook: prediction_result_of_football_match.ipynb
Librerie necessarie all'utilizzo
!pip install --upgrade pandas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import requests
from datetime import date
from bs4 import BeautifulSoup
from google.colab import drive
import math
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM
import matplotlib.pyplot as plt
from datetime import datetime
from joblib import dump, load
from pathlib import Path
from sklearn.metrics import mean_squared_error
import time
import datetime
from sklearn.preprocessing import StandardScaler
import sklearn
import urllib
import csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
drive.mount('/content/drive')
import warnings
warnings.simplefilter("ignore")
Data extraction
Riceverò i dati attraverso il seguente url.
# Index page that links every Italian season CSV published by football-data.co.uk.
URL = "https://www.football-data.co.uk/italym.php"
# A timeout prevents the notebook from hanging forever if the site stalls.
resp = requests.get(URL, timeout=30)
print(resp.status_code)
# Build the "soup" object; the parser is named explicitly so the parse tree
# does not depend on which optional parsers happen to be installed.
data = BeautifulSoup(resp.content, "html.parser")
def Dataframe_RES(path):
    """Download one CSV from football-data.co.uk and parse it with pandas.

    Args:
        path: URL path relative to ``https://www.football-data.co.uk/``.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the download or the
        parsing fails. On failure the full URL is appended to the
        module-level ``error`` list so the caller can retry it later.
    """
    url = 'https://www.football-data.co.uk/' + path
    try:
        # The context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(url) as res:
            return pd.read_csv(res)
    except Exception:
        # Deliberately best-effort: any failure is recorded for the retry
        # pass, but KeyboardInterrupt/SystemExit are no longer swallowed.
        error.append(url)
        print("An exception occurred")
        return None


# URLs that Dataframe_RES failed to download or parse.
error = []
# One DataFrame per season file: "I1" files are Serie A, "I2" files are Serie B.
SERIE_A = []
SERIE_B = []
for link in data.find_all('a'):
    path = link.get('href')
    # Anchors without an href return None; skip them instead of crashing.
    if not path or not path.endswith(".csv"):
        continue
    league = path[-6:-4]  # the two characters right before ".csv"
    print(path, league)
    if league == 'I1':
        df = Dataframe_RES(path)
        # A failed download returns None and is handled by the retry pass.
        if df is not None:
            SERIE_A.append(df)
        print('SERIE_A')
    if league == 'I2':
        df = Dataframe_RES(path)
        if df is not None:
            SERIE_B.append(df)
        print('SERIE_B')

# Retry the failed URLs with a permissive encoding, skipping malformed rows.
print(len(error))
for e in error:
    with urllib.request.urlopen(e) as res:
        # `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'`
        # is its replacement.
        df = pd.read_csv(res, encoding='latin1', on_bad_lines='skip')
    league = e[-6:-4]
    if league == 'I1':
        SERIE_A.append(df)
        print('SERIE_A')
    if league == 'I2':
        SERIE_B.append(df)
        print('SERIE_B')
Note per i dati di calcio
Tutti i dati sono in formato csv, pronti per l'uso all'interno di applicazioni di fogli di calcolo standard. Si prega di notare che alcune abbreviazioni non sono più in uso (in particolare le quote di specifici bookmaker non più utilizzate) e si riferiscono ai dati raccolti nelle stagioni precedenti. Per un elenco aggiornato di quali bookmaker sono inclusi nel set di dati, visitare http://www.football-data.co.uk/matches.php
Per informazioni sulle chiavi del Dataset: README
# Stack all season frames of each league. `DataFrame.append` was removed in
# pandas 2.0, so `pd.concat` is used; None placeholders left by failed
# downloads are filtered out.
ds_serieA = pd.concat([df for df in SERIE_A if df is not None], ignore_index=True)
ds_serieB = pd.concat([df for df in SERIE_B if df is not None], ignore_index=True)
ds_serieB
# Single table holding every Serie A and Serie B match.
matches = pd.concat([ds_serieA, ds_serieB], ignore_index=True)
Data Preparation
# Number of missing values per column.
x = matches.isnull().sum()
# Drop every column whose missing-value count is at least len(matches) - 1000.
# A boolean mask replaces the positional `x[col]` lookups, which are
# deprecated on a label-indexed Series.
eliminate = list(x[x >= len(matches) - 1000].index)
count = len(eliminate)
matches = matches.drop(columns=eliminate)
print('count=', count)
matches['FTR']
# Recode the full-time result: A (away win) -> "2", D (draw) -> "X",
# H (home win) -> "1". Any other value is left untouched. A single `replace`
# avoids chained assignment, which does nothing under copy-on-write.
matches['risultato'] = matches['FTR'].replace({"A": "2", "D": "X", "H": "1"})
# Keep the date, the two teams, the odds of six bookmakers and the target,
# then discard rows with any missing value.
dataset = matches[["Date","HomeTeam","AwayTeam","B365H","B365D","B365A","BWH","BWD","BWA","IWH","IWD","IWA","PSH","PSD","PSA","WHH","WHD","WHA","VCH","VCD","VCA","risultato"]]
dataset = dataset.dropna()
dataset
def _match_timestamp(date_string):
    """Convert a ``dd/mm/yyyy`` or ``dd/mm/yy`` date into a local-time POSIX timestamp.

    Two-digit years are parsed with ``%y`` so that 1990s seasons are not
    pushed into the 2090s.

    Raises:
        ValueError: if the string matches neither format.
    """
    try:
        parsed = datetime.datetime.strptime(date_string, "%d/%m/%Y")
    except ValueError:
        parsed = datetime.datetime.strptime(date_string, "%d/%m/%y")
    return time.mktime(parsed.timetuple())


# Replace the textual dates with numeric timestamps. The loop variable is not
# called `date`, so the `date` class imported from `datetime` stays usable.
new_date = [_match_timestamp(str(raw_date)) for raw_date in dataset['Date']]
dataset['Date'] = new_date
list_home_team = dataset['HomeTeam'].unique()
list_home_team.sort()
list_away_team = dataset['AwayTeam'].unique()
list_away_team.sort()
# Sanity check: the same teams appear as home and as away side.
# `np.array_equal` also copes with arrays of different length.
print(np.array_equal(list_away_team, list_home_team))
# Integer code for each team, in alphabetical order of the home-team names.
team_dict = dict(zip(list_home_team, np.arange(len(list_home_team))))
# Encode both columns in one pass. Names missing from `team_dict` are left
# unchanged, as before. This avoids chained assignment with a mask taken from
# a different frame, which is a silent no-op under copy-on-write.
dataset['HomeTeam'] = dataset['HomeTeam'].map(lambda team: team_dict.get(team, team))
dataset['AwayTeam'] = dataset['AwayTeam'].map(lambda team: team_dict.get(team, team))
dataset
labels = ['1', '2', 'X']
print(dataset['risultato'].value_counts())

# Features are every column except the target.
X = dataset.drop('risultato', axis=1)
Y = dataset['risultato']

# 70% train; the remaining 30% is split again into validation (67%) and
# test (33%). Both splits are stratified on the target.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, stratify=Y, random_state=42
)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_test, y_test, test_size=0.33, stratify=y_test, random_state=42
)

# Targets as strings.
y_train, y_validation, y_test = (
    part.astype('str') for part in (y_train, y_validation, y_test)
)
print(y_train.value_counts())

# Standardise using statistics learned on the training split only.
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train))
x_validation, x_test = (
    pd.DataFrame(scaler.transform(part)) for part in (x_validation, x_test)
)
MODEL TRAINING
# Loading
def load_model(model_path, model):
    """Return the object stored at ``model_path``, read with joblib ``load``.

    ``model`` is accepted for call-site compatibility; its value is not used.
    NOTE: joblib files are pickle-based, so only load files you trust.
    """
    return load(model_path)
# Saving
def save_model(model, model_path):
    """Write ``model`` to ``model_path`` with joblib ``dump`` and return ``model``."""
    dump(model, model_path)
    return model
def verifica_esistenza_modello(file_name):
    """Report whether ``file_name`` is an existing regular file.

    Prints the outcome and returns it as a bool. An ``IOError`` raised while
    checking is reported and treated as "does not exist".
    """
    try:
        esiste = Path(file_name).is_file()
    except IOError:
        print("File not accessible")
        esiste = False
    print("Il modello addestrato esiste?", esiste)
    return esiste
SVM:
from sklearn.svm import SVC

path_name = '/content/drive/MyDrive/project/Model/' + "model_prediction_risultato_match" + "_SVM_Scaler_" + "ep.model"
print("Path =", path_name)
exist_model = verifica_esistenza_modello(path_name)

# SVC wrapped in a one-vs-rest strategy; both branches start from the same
# untrained wrapper.
model_SVC = SVC()
ovr_SVC = OneVsRestClassifier(model_SVC)
if exist_model:
    # Reuse the classifier saved by an earlier run.
    ovr_SVC = load_model(path_name, ovr_SVC)
else:
    ovr_SVC.fit(x_train, y_train)
    ovr_SVC = save_model(ovr_SVC, path_name)

# Score on the validation split.
y_pred_SVC = ovr_SVC.predict(x_validation)
print(accuracy_score(y_validation, y_pred_SVC))
print(classification_report(y_validation, y_pred_SVC))
RandomForestClassifier:
from sklearn.ensemble import RandomForestClassifier

path_name = '/content/drive/MyDrive/project/Model/' + "model_prediction_risultato_match" + "_RandomForestClassifier_Scaler_" + "ep.model"
print("Path =", path_name)
exist_model = verifica_esistenza_modello(path_name)

# Random forest wrapped in a one-vs-rest strategy; both branches start from
# the same untrained wrapper.
model_RandomForest = RandomForestClassifier()
ovr_RandomForest = OneVsRestClassifier(model_RandomForest)
if exist_model:
    # Reuse the classifier saved by an earlier run.
    ovr_RandomForest = load_model(path_name, ovr_RandomForest)
else:
    ovr_RandomForest.fit(x_train, y_train)
    ovr_RandomForest = save_model(ovr_RandomForest, path_name)

# Score on the validation split.
y_pred_RandomForest = ovr_RandomForest.predict(x_validation)
print(accuracy_score(y_validation, y_pred_RandomForest))
print(classification_report(y_validation, y_pred_RandomForest))
DecisionTreeClassifier:
from sklearn.tree import DecisionTreeClassifier

path_name = '/content/drive/MyDrive/project/Model/' + "model_prediction_risultato_match" + "_DecisionTree_Scaler_" + "ep.model"
print("Path =", path_name)
exist_model = verifica_esistenza_modello(path_name)

# Decision tree wrapped in a one-vs-rest strategy; both branches start from
# the same untrained wrapper.
model_DecisionTree = DecisionTreeClassifier()
ovr_DecisionTree = OneVsRestClassifier(model_DecisionTree)
if exist_model:
    # Reuse the classifier saved by an earlier run.
    ovr_DecisionTree = load_model(path_name, ovr_DecisionTree)
else:
    ovr_DecisionTree.fit(x_train, y_train)
    ovr_DecisionTree = save_model(ovr_DecisionTree, path_name)

# Score on the validation split.
y_pred_DecisionTree = ovr_DecisionTree.predict(x_validation)
print(accuracy_score(y_validation, y_pred_DecisionTree))
print(classification_report(y_validation, y_pred_DecisionTree))
kNN:
from sklearn.neighbors import KNeighborsClassifier

path_name = '/content/drive/MyDrive/project/Model/' + "model_prediction_risultato_match" + "_KNearestNeighbors_Scaler_" + "ep.model"
print("Path =", path_name)
exist_model = verifica_esistenza_modello(path_name)

# k-nearest neighbours wrapped in a one-vs-rest strategy; both branches start
# from the same untrained wrapper.
model_kNN = KNeighborsClassifier()
ovr_kNN = OneVsRestClassifier(model_kNN)
if exist_model:
    # Reuse the classifier saved by an earlier run.
    ovr_kNN = load_model(path_name, ovr_kNN)
else:
    ovr_kNN.fit(x_train, y_train)
    ovr_kNN = save_model(ovr_kNN, path_name)

# Score on the validation split.
y_pred_kNN = ovr_kNN.predict(x_validation)
print(accuracy_score(y_validation, y_pred_kNN))
print(classification_report(y_validation, y_pred_kNN))
AdaBoost Classifier:
from sklearn.ensemble import AdaBoostClassifier

path_name = '/content/drive/MyDrive/project/Model/' + "model_prediction_risultato_match" + "_AdaBoostClassifier_Scaler_" + "ep.model"
print("Path =", path_name)
exist_model = verifica_esistenza_modello(path_name)

# AdaBoost wrapped in a one-vs-rest strategy; both branches start from the
# same untrained wrapper.
model_AdaBoost = AdaBoostClassifier()
ovr_AdaBoost = OneVsRestClassifier(model_AdaBoost)
if exist_model:
    # Reuse the classifier saved by an earlier run.
    ovr_AdaBoost = load_model(path_name, ovr_AdaBoost)
else:
    ovr_AdaBoost.fit(x_train, y_train)
    ovr_AdaBoost = save_model(ovr_AdaBoost, path_name)

# Score on the validation split.
y_pred_AdaBoost = ovr_AdaBoost.predict(x_validation)
print(accuracy_score(y_validation, y_pred_AdaBoost))
print(classification_report(y_validation, y_pred_AdaBoost))
Neural Networks:
from tensorflow import keras


# Build the network model
def Neural_network_model():
    """Return an uncompiled dense classifier.

    Layers: 21 inputs -> Dense(32, relu) -> Dense(16, relu) -> Dropout(0.3)
    -> Dense(8, relu) -> Dense(3, softmax).
    """
    network = keras.Sequential()
    network.add(keras.layers.Dense(32, input_dim=21, activation='relu'))
    network.add(keras.layers.Dense(16, activation='relu'))
    network.add(keras.layers.Dropout(0.3))
    network.add(keras.layers.Dense(8, activation='relu'))
    network.add(keras.layers.Dense(3, activation='softmax'))
    return network
model = Neural_network_model()
# Compile the model
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
# One-hot targets, one column per class label.
y_train_enc = pd.get_dummies(y_train)
y_validation_enc = pd.get_dummies(y_validation)
y_test_enc = pd.get_dummies(y_test)
epochs = 500
path_name = '/content/drive/MyDrive/project/Model/' + "model_prediction_risultato_match" + "_NNClassifier_Scaler_" + str(epochs) + "_ep.model"
print("Path =", path_name)
exist_model = verifica_esistenza_modello(path_name)
# NeuralNet
if not exist_model:
    history = model.fit(x_train, y_train_enc, epochs=epochs, validation_split=0.2)
    history_log = history.history
    # Persist the training curves together with the trained weights. Saving
    # only the History object left `model` untrained whenever the file was
    # reloaded, so later predictions came from a randomly initialised network.
    save_model({"history": history_log, "weights": model.get_weights()}, path_name)
else:
    saved = load_model(path_name, model)
    if isinstance(saved, dict):
        history_log = saved["history"]
        model.set_weights(saved["weights"])
    else:
        # File written by the previous version of this cell: it holds only
        # the History object, so the weights cannot be restored from it.
        history_log = saved.history
        print("WARNING: saved file has no weights; delete it and retrain before predicting.")

plt.plot(history_log['accuracy'], label="accuracy")
plt.plot(history_log['val_accuracy'], label="val_accuracy")
plt.legend(loc="upper left")
plt.title('Accuracy vs Val_accuracy')
plt.show()
plt.plot(history_log['loss'], label="loss")
plt.plot(history_log['val_loss'], label="val_loss")
plt.legend(loc="upper left")
plt.title('Loss vs Val_loss')
plt.show()
# Predicted class index (position of the largest softmax output) per row.
y_pred_NN = model.predict(x_validation)
y_pred_NN = np.argmax(y_pred_NN, axis=1)
# One-hot encode against the full class set, so a class that is never
# predicted still gets an all-zero column in the correct position. The old
# padding added columns labelled 2 and 3, which misaligned the predictions
# with the columns of `y_validation_enc`.
n_classes = y_validation_enc.shape[1]
y_pred_NN_enc = pd.get_dummies(
    pd.Series(pd.Categorical(y_pred_NN, categories=list(range(n_classes))))
)
print(len(y_pred_NN_enc.columns))
print(accuracy_score(y_validation_enc, y_pred_NN_enc))
print(classification_report(y_validation_enc, y_pred_NN_enc))
Confronto su un nuovo dataset di Test per determinare il classificatore migliore:
def _evaluate_on_test(name, y_true, y_pred):
    """Print the test-set metrics of one model.

    Returns:
        Tuple ``(accuracy, text report, weighted-average F1)``. The F1 is
        read from the structured report instead of being parsed out of the
        two-decimal text table.
    """
    accuracy = accuracy_score(y_true, y_pred)
    report_text = classification_report(y_true, y_pred)
    weighted_f1 = classification_report(y_true, y_pred, output_dict=True)['weighted avg']['f1-score']
    print(("_" * 55) + "\n" + name + "\naccuracy:", accuracy, "\nweighted avg:", weighted_f1)
    print(report_text)
    return accuracy, report_text, weighted_f1


# Weighted-average F1 on the held-out test split, keyed by model name.
ris_model = dict()

y_pred_SVC_test = ovr_SVC.predict(x_test)
acc_prediction_SVC, classification_report_prediction_SVC, weighted_avg_SVC = _evaluate_on_test('SVC', y_test, y_pred_SVC_test)
ris_model['SVC'] = weighted_avg_SVC

y_pred_RandomForest_test = ovr_RandomForest.predict(x_test)
acc_prediction_RandomForest, classification_report_prediction_RandomForest, weighted_avg_RandomForest = _evaluate_on_test('RandomForest', y_test, y_pred_RandomForest_test)
ris_model['RandomForest'] = weighted_avg_RandomForest

y_pred_DecisionTree_test = ovr_DecisionTree.predict(x_test)
acc_prediction_DecisionTree, classification_report_prediction_DecisionTree, weighted_avg_DecisionTree = _evaluate_on_test('DecisionTree', y_test, y_pred_DecisionTree_test)
ris_model['DecisionTree'] = weighted_avg_DecisionTree

y_pred_kNN_test = ovr_kNN.predict(x_test)
acc_prediction_kNN, classification_report_prediction_kNN, weighted_avg_kNN = _evaluate_on_test('kNN', y_test, y_pred_kNN_test)
ris_model['kNN'] = weighted_avg_kNN

y_pred_AdaBoost_test = ovr_AdaBoost.predict(x_test)
acc_prediction_AdaBoost, classification_report_prediction_AdaBoost, weighted_avg_AdaBoost = _evaluate_on_test('AdaBoost', y_test, y_pred_AdaBoost_test)
ris_model['AdaBoost'] = weighted_avg_AdaBoost

# Neural network: one-hot encode the argmax against the full class set, so a
# class that is never predicted keeps an all-zero column in the right place.
# The old padding added columns labelled 2 and 3, misaligned with y_test_enc.
y_pred_NN_test = model.predict(x_test)
y_pred_NN_test = np.argmax(y_pred_NN_test, axis=1)
y_pred_NN_test_enc = pd.get_dummies(
    pd.Series(pd.Categorical(y_pred_NN_test, categories=list(range(y_test_enc.shape[1]))))
)
acc_prediction_NN, classification_report_prediction_NN, weighted_avg_NN = _evaluate_on_test('NeuralNetwork', y_test_enc, y_pred_NN_test_enc)
ris_model['NeuralNetwork'] = weighted_avg_NN

# Best model = highest weighted F1; ties go to the first model inserted.
max_acc_model = max(ris_model.values())
print("Il miglior modello è :", max(ris_model, key=ris_model.get))
OUTPUT:
Il miglior modello è : RandomForest
Ottimizzazione dei modelli di classificazione
Se confrontiamo i rapporti di classificazione di tutti i classificatori che abbiamo provato, possiamo vedere che il classificatore RandomForest ha ottenuto i risultati migliori. Possiamo eseguire l'ottimizzazione degli iperparametri di ciascun classificatore per migliorarne le prestazioni e trovare il miglior set di valori di iperparametri. Userò GridSearchCV: per AdaBoost, ad esempio, ottimizzerò i parametri learning_rate e n_estimators. Come metrica di punteggio ho scelto "f1_weighted", che pesa ogni classe in base alla distribuzione delle classi.
RandomForest OPT:
from sklearn.model_selection import GridSearchCV

path_name = '/content/drive/MyDrive/project/Model/' + "model_prediction_risultato_match" + "_RandomForestClassifier_Scaler_OPT_" + "ep.model"
print("Path =", path_name)
exist_model = verifica_esistenza_modello(path_name)
# RandomForest
if not exist_model:
    # The `estimator__` prefix routes each parameter to the forest wrapped by
    # OneVsRestClassifier.
    params = {
        'estimator__bootstrap': [True, False],
        'estimator__max_depth': [10, 20, None],
        # 'auto' was an alias of 'sqrt' for classifiers and has been removed
        # from scikit-learn, so only 'sqrt' is kept.
        'estimator__max_features': ['sqrt'],
        'estimator__min_samples_leaf': [1, 2],
        'estimator__min_samples_split': [2, 4],
        'estimator__n_estimators': [10, 20]
    }
    classes = y_train.unique()
    model_RandomForest = RandomForestClassifier()
    ovr_RandomForest = OneVsRestClassifier(model_RandomForest)
    grid_clf = GridSearchCV(estimator=ovr_RandomForest, scoring='f1_weighted', param_grid=params, cv=5, n_jobs=-1, verbose=3)
    grid_clf.fit(x_train, y_train)
    print("the best parameters are: ", grid_clf.best_estimator_)
    # With the default refit=True, `best_estimator_` is already fitted on the
    # whole training set; fitting it again only produced a different random
    # forest from the one the search selected.
    ovr_RandomForest = save_model(grid_clf.best_estimator_, path_name)
else:
    model_RandomForest = RandomForestClassifier()
    ovr_RandomForest = OneVsRestClassifier(model_RandomForest)
    ovr_RandomForest = load_model(path_name, ovr_RandomForest)
y_pred_RandomForest = ovr_RandomForest.predict(x_validation)
print(accuracy_score(y_validation, y_pred_RandomForest))
print(classification_report(y_validation, y_pred_RandomForest))
SVC OPT:
path_name = '/content/drive/MyDrive/project/Model/' + "model_prediction_risultato_match" + "_SVM_Scaler_OPT_" + "ep.model"
print("Path =", path_name)
exist_model = verifica_esistenza_modello(path_name)
# SVC
if not exist_model:
    params = {
        # C must be strictly positive: the previous grid (0, 2, 4) made every
        # C=0 candidate fail. The default value 1 replaces the 0.
        "estimator__C": [1, 2, 4],
        "estimator__kernel": ["linear", "poly", "rbf"],
        # `degree` only affects the "poly" kernel.
        "estimator__degree": np.arange(1, 5)
    }
    classes = y_train.unique()
    model_SVC = SVC()
    ovr_SVC = OneVsRestClassifier(model_SVC)
    grid_clf = GridSearchCV(estimator=ovr_SVC, scoring='f1_weighted', param_grid=params, cv=5, n_jobs=-1, verbose=3)
    grid_clf.fit(x_train, y_train)
    print("the best parameters are: ", grid_clf.best_estimator_)
    # With the default refit=True, `best_estimator_` is already fitted on the
    # whole training set, so no second fit is needed.
    ovr_SVC = save_model(grid_clf.best_estimator_, path_name)
else:
    model_SVC = SVC()
    ovr_SVC = OneVsRestClassifier(model_SVC)
    ovr_SVC = load_model(path_name, ovr_SVC)
y_pred_SVC = ovr_SVC.predict(x_validation)
print(accuracy_score(y_validation, y_pred_SVC))
print(classification_report(y_validation, y_pred_SVC))
ADABOOST OPT:
path_name = '/content/drive/MyDrive/project/Model/' + "model_prediction_risultato_match" + "_AdaBoostClassifier_Scaler_OPT_" + "ep.model"
print("Path =", path_name)
exist_model = verifica_esistenza_modello(path_name)
# AdaBoostClassifier
if not exist_model:
    params = {
        "estimator__n_estimators": np.arange(10, 100, 10),
        "estimator__learning_rate": [0.05, 0.1, 1]
    }
    classes = y_train.unique()
    model_AdaBoost = AdaBoostClassifier()
    ovr_AdaBoost = OneVsRestClassifier(model_AdaBoost)
    grid_clf = GridSearchCV(estimator=ovr_AdaBoost, scoring='f1_weighted', param_grid=params, cv=5, n_jobs=-1, verbose=3)
    grid_clf.fit(x_train, y_train)
    print("the best parameters are: ", grid_clf.best_estimator_)
    # With the default refit=True, `best_estimator_` is already fitted on the
    # whole training set, so no second fit is needed.
    ovr_AdaBoost = save_model(grid_clf.best_estimator_, path_name)
else:
    model_AdaBoost = AdaBoostClassifier()
    ovr_AdaBoost = OneVsRestClassifier(model_AdaBoost)
    ovr_AdaBoost = load_model(path_name, ovr_AdaBoost)
y_pred_AdaBoost = ovr_AdaBoost.predict(x_validation)
print(accuracy_score(y_validation, y_pred_AdaBoost))
print(classification_report(y_validation, y_pred_AdaBoost))
DecisionTree OPT:
path_name = '/content/drive/MyDrive/project/Model/' + "model_prediction_risultato_match" + "_DecisionTree_Scaler_OPT_" + "ep.model"
print("Path =", path_name)
exist_model = verifica_esistenza_modello(path_name)
# DecisionTree
if not exist_model:
    params = {
        "estimator__criterion": ['gini', 'entropy'],
        "estimator__max_depth": [2, 4, 6, 8, 10, 12]
    }
    classes = y_train.unique()
    model_DecisionTree = DecisionTreeClassifier()
    ovr_DecisionTree = OneVsRestClassifier(model_DecisionTree)
    grid_clf = GridSearchCV(estimator=ovr_DecisionTree, scoring='f1_weighted', param_grid=params, cv=5, n_jobs=-1, verbose=3)
    grid_clf.fit(x_train, y_train)
    print("the best parameters are: ", grid_clf.best_estimator_)
    # With the default refit=True, `best_estimator_` is already fitted on the
    # whole training set, so no second fit is needed.
    ovr_DecisionTree = save_model(grid_clf.best_estimator_, path_name)
else:
    model_DecisionTree = DecisionTreeClassifier()
    ovr_DecisionTree = OneVsRestClassifier(model_DecisionTree)
    ovr_DecisionTree = load_model(path_name, ovr_DecisionTree)
y_pred_DecisionTree = ovr_DecisionTree.predict(x_validation)
print(accuracy_score(y_validation, y_pred_DecisionTree))
print(classification_report(y_validation, y_pred_DecisionTree))
kNN OPT:
path_name = '/content/drive/MyDrive/project/Model/' + "model_prediction_risultato_match" + "_kNN_Scaler_OPT_" + "ep.model"
print("Path =", path_name)
exist_model = verifica_esistenza_modello(path_name)
# kNN
if not exist_model:
    params = {
        "estimator__leaf_size": np.arange(2, 26, 2),
        "estimator__n_neighbors": np.arange(2, 26, 2),
        "estimator__p": [1, 2]
    }
    classes = y_train.unique()
    model_kNN = KNeighborsClassifier()
    ovr_kNN = OneVsRestClassifier(model_kNN)
    grid_clf = GridSearchCV(estimator=ovr_kNN, scoring='f1_weighted', param_grid=params, cv=5, n_jobs=-1, verbose=3)
    grid_clf.fit(x_train, y_train)
    print("the best parameters are: ", grid_clf.best_estimator_)
    # With the default refit=True, `best_estimator_` is already fitted on the
    # whole training set, so no second fit is needed.
    ovr_kNN = save_model(grid_clf.best_estimator_, path_name)
else:
    model_kNN = KNeighborsClassifier()
    ovr_kNN = OneVsRestClassifier(model_kNN)
    ovr_kNN = load_model(path_name, ovr_kNN)
y_pred_kNN = ovr_kNN.predict(x_validation)
print(accuracy_score(y_validation, y_pred_kNN))
print(classification_report(y_validation, y_pred_kNN))
Possiamo vedere che le prestazioni del modello sono migliorate con i parametri GridSearch e il RandomForest continua a superare gli altri modelli.
Questo articolo ha discusso le sfide della classificazione multiclasse e ha dimostrato come implementare vari algoritmi per sviluppare modelli di classificazione multiclasse migliori.
# Model used further down to predict the upcoming fixtures: the one-vs-rest
# RandomForest currently bound to `ovr_RandomForest`.
final_model=ovr_RandomForest
Predizione dei risultati della prossima giornata di campionato
Utilizzo dei seguenti link:
Conversione da immagine in testo: image-to-text-converter
Screenshot alle seguenti quote:
bet365: bet365
Bet&Win: sports.bwin
Interwetten: interwetten
Pinnacle: pinnacle
William Hill: sports.williamhill
VC Bet: betcalcio
LEGGENDA dei dati utilizzati:
"Date": Data della partita (gg/mm/aa)
"HomeTeam": Squadra di casa
"AwayTeam": Squadra ospite
"B365H": Bet365 quote vittoria in casa
"B365D": Bet365 quote di pareggio
"B365A": Bet365 quote vittoria in trasferta
"BWH": Quote di vincita in casa Bet&Win
"BWD": Quote di pareggio Bet&Win
"BWA": Quote di vincita in trasferta Bet&Win
"IWH": Quote di vittoria in casa Interwetten
"IWD": Quote di pareggio Interwetten
"IWA": Quote di vincita in trasferta Interwetten
"PSH": Pinnacle quote di vittoria in casa
"PSD": Quote di pareggio Pinnacle
"PSA": Pinnacle quote di vittoria in trasferta
"WHH": William Hill quote vittoria in casa
"WHD": William Hill quote di pareggio
"WHA": William Hill in trasferta quote vittoria
"VCH": VC Bet quote di vincita in casa
"VCD": VC Bet quote di pareggio
"VCA": VC Bet quote di vincita in trasferta
Previsione sulle prossime partite dati i dati attuali
Le quote attuali delle agenzie di scommesse vanno inserite manualmente, perché i permessi dei siti di scommesse ne rendono impossibile l'estrazione automatica.
# Hand-entered odds, one list per bookmaker. Each row is one fixture as
# [home win, draw, away win]; rows follow the key order of `ds_pred` below.
# Bet365 odds (columns B365H, B365D, B365A).
Bet365=[[2.30,3.20,3.10],
[3.20,3.20,2.25],
[1.60,4.00,5.25],
[1.30,5.50,8.50],
[2.10,3.50,3.30],
[2.45,3.20,2.90],
[1.30,5.25,9.50],
[5.25,4.00,1.60],
[5.00,3.75,1.70],
[3.60,3.25,2.05]]
# Bet&Win odds (columns BWH, BWD, BWA).
Bet_and_Win=[[2.30,3.30,3.10],
[3.20,3.30,2.30],
[1.60,4.20,5.00],
[1.32,5.50,8.75],
[2.15,3.60,3.20],
[2.40,3.30,2.90],
[1.30,5.75,9.75],
[5.00,4.10,1.62],
[5.00,3.90,1.68],
[3.60,3.40,2.05]]
# Interwetten odds (columns IWH, IWD, IWA).
Interwetten=[[2.35,3.30,3.05],
[3.20,3.25,2.30],
[1.63,4.20,5.25],
[1.33,5.75,8.25],
[2.15,3.55,3.25],
[2.50,3.75,2.90],
[1.30,5.75,9.50],
[5.50,4.10,1.60],
[4.90,3.80,1.70],
[3.65,3.30,2.10]]
# Pinnacle odds (columns PSH, PSD, PSA).
Pinnacle=[[2.480,3.330,3.140],
[3.310,3.320,2.390],
[1.675,4.260,5.170],
[1.317,5.780,10.010],
[2.600,3.120,3.100],
[2.210,3.580,3.380],
[1.298,5.760,11.400],
[6.070,4.120,1.602],
[5.270,3.720,1.746],
[3.890,3.300,2.140]]
# William Hill odds (columns WHH, WHD, WHA).
William_Hill=[[2.30,3.30,3.10],
[3.20,3.20,2.30],
[1.60,4.00,5.50],
[1.32,5.25,9.50],
[2.10,3.60,3.30],
[2.45,3.20,3.00],
[1.28,5.25,11.00],
[5.50,3.90,1.60],
[5.00,3.70,1.70],
[3.75,3.25,2.05]]
# VC Bet odds (columns VCH, VCD, VCA).
VC_Bet=[[2.49,3.32,3.19],
[3.36,3.36,2.38],
[1.70,4.34,5.50],
[1.34,6.05,9.50],
[2.18,3.78,3.40],
[2.63,3.25,3.15],
# NOTE(review): home odds of 1.00 look like a typo; the other bookmakers
# quote about 1.30 for this fixture. Confirm against the source.
[1.00,5.90,10.00],
[5.75,4.32,1.65],
[5.30,4.00,1.77],
[4.06,3.32,2.16]]
# Bookmaker order matters: the frame-building loop reads quote[0]..quote[5]
# as B365, BW, IW, PS, WH, VC.
quote=[Bet365,Bet_and_Win,Interwetten,Pinnacle,William_Hill,VC_Bet]
# Upcoming fixtures keyed as "Home - Away", with kick-off time ('ora') and
# date ('data', dd/mm/yyyy).
ds_pred={'Empoli - Monza':{'ora':'15:00','data':'15/10/2022'}
,'Torino - Juventus':{'ora':'18:00','data':'15/10/2022'}
,'Atalanta - Sassuolo':{'ora':'20:45' ,'data':'15/10/2022'}
,'Inter - Salernitana':{'ora':'11:30','data':'16/10/2022'}
,'Lazio - Udinese':{'ora':'15:00','data':'16/10/2022'}
,'Spezia - Cremonese':{'ora':'15:00','data':'16/10/2022'}
,'Napoli - Bologna':{'ora':'18:00','data':'16/10/2022'}
,'Verona - Milan':{'ora':'20:45','data':'16/10/2022'}
,'Sampdoria - Roma':{'ora':'18:30','data':'17/10/2022'}
,'Lecce - Fiorentina':{'ora':'20:45','data':'17/10/2022'}}
# Column layout matches the training features: Date, HomeTeam, AwayTeam, then
# home/draw/away odds for each bookmaker in the order used by `quote`.
bookmaker_prefixes = ["B365", "BW", "IW", "PS", "WH", "VC"]
outcome_suffixes = ["H", "D", "A"]
new_ds = {"Date": [], "HomeTeam": [], "AwayTeam": []}
for prefix in bookmaker_prefixes:
    for suffix in outcome_suffixes:
        new_ds[prefix + suffix] = []

lista_matches = list(ds_pred.keys())
n_match = len(Bet365)
for n in range(n_match):
    fixture = lista_matches[n]
    date_string = ds_pred[fixture]['data']
    kickoff_day = datetime.datetime.strptime(date_string, "%d/%m/%Y")
    new_ds["Date"].append(time.mktime(kickoff_day.timetuple()))
    sides = fixture.split(' - ')
    new_ds['HomeTeam'].append(sides[0])
    new_ds['AwayTeam'].append(sides[1])
    # quote[b][n] is the [home, draw, away] triple of bookmaker b for fixture n.
    for b, prefix in enumerate(bookmaker_prefixes):
        for o, suffix in enumerate(outcome_suffixes):
            new_ds[prefix + suffix].append(quote[b][n][o])

new_pred_ds = pd.DataFrame(new_ds)
# Keep the readable team names for the final table. `.copy()` makes it an
# independent frame, so adding a column below is not an assignment on a slice.
new_pred_ds_valid = new_pred_ds[['HomeTeam', 'AwayTeam']].copy()
# Encode team names with the integer codes used for training. `map` replaces
# the chained assignment, which is a silent no-op under copy-on-write.
for column in ('HomeTeam', 'AwayTeam'):
    new_pred_ds[column] = new_pred_ds[column].map(lambda team: team_dict.get(team, team))
# A team absent from the training data cannot be encoded; fail with a clear
# message instead of an obscure conversion error inside the scaler.
unknown_teams = sorted({
    team
    for column in ('HomeTeam', 'AwayTeam')
    for team in new_pred_ds[column]
    if isinstance(team, str)
})
if unknown_teams:
    raise ValueError("Teams missing from the training data: " + ", ".join(unknown_teams))
new_pred_ds
# Apply the scaler fitted on the training split, then predict.
new_pred_ds_scaler = pd.DataFrame(scaler.transform(new_pred_ds))
new_pred_ds_scaler_pred = final_model.predict(new_pred_ds_scaler)
print(new_pred_ds_scaler_pred)
# 'Quota' holds the predicted outcome ("1", "X" or "2") for each fixture.
new_pred_ds_valid['Quota'] = new_pred_ds_scaler_pred
new_pred_ds_valid
Comments