¿Cómo puedo devolver las tasas de precisión para las predicciones Top N utilizando el SGDClassifier de sklearn?

Estoy tratando de modificar los resultados en esta publicación (Cómo obtener las predicciones Top 3 o Top N usando el SGDClassifier de sklearn) para obtener el índice de precisión, sin embargo, obtengo un índice de precisión de cero y no puedo entender por qué. ¿Alguna idea? ¡Cualquier pensamiento / edición sería muy apreciado! Gracias.

import numpy as np


def top_n_accuracy(probs, test, n):
    """Return the fraction of rows whose true class is among the top n.

    Args:
        probs: 2-D array of per-class scores, one row per sample.
        test: 2-D array in the same row order as ``probs``; the column
            holding each row's maximum is taken as that row's true class
            index (for example a one-hot label matrix).
        n: number of highest-scoring classes to accept; must be >= 1.

    Returns:
        The hit rate as a float between 0.0 and 1.0.

    Raises:
        ValueError: if ``n`` is smaller than 1.
        ZeroDivisionError: if ``test`` has no rows.
    """
    if n < 1:
        # With n == 0 the slice ``[:, -0:]`` would select every column and
        # count every row as a hit.
        raise ValueError("n must be a positive integer")
    best_n = np.argsort(probs, axis=1)[:, -n:]
    ts = np.argmax(test, axis=1)
    successes = sum(1 for i, truth in enumerate(ts) if truth in best_n[i, :])
    return float(successes) / ts.shape[0]


if __name__ == "__main__":
    # scikit-learn is only needed by this demo, so it is imported here and
    # top_n_accuracy stays importable with NumPy alone.
    from sklearn import linear_model
    from sklearn.feature_extraction.text import TfidfVectorizer

    arr = ['dogs cats lions', 'apple pineapple orange',
           'water fire earth air', 'sodium potassium calcium']
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(arr)
    feature_names = vectorizer.get_feature_names()
    Y = ['animals', 'fruits', 'elements', 'chemicals']

    T = ["eating apple roasted in fire and enjoying fresh air"]
    test = vectorizer.transform(T)

    clf = linear_model.SGDClassifier(loss='log')
    clf.fit(X, Y)
    x = clf.predict(test)

    n = 2
    probs = clf.predict_proba(test)
    # NOTE(review): `test` is the TF-IDF matrix built from T, not a label
    # matrix, so the argmax inside top_n_accuracy runs over vocabulary
    # columns rather than class columns -- confirm this is intended.
    print(top_n_accuracy(probs, test, n))

 import numpy as np
 from sklearn import linear_model
 from sklearn.feature_extraction.text import TfidfVectorizer

 # Training corpus: one short document per class label in Y.
 arr = [
     'dogs cats lions',
     'apple pineapple orange',
     'water fire earth air',
     'sodium potassium calcium',
 ]
 Y = ['animals', 'fruits', 'elements', 'chemicals']

 # Documents to classify.
 T = [
     "eating apple roasted in fire and enjoying fresh air",
     "I love orange",
 ]

 # Fit the TF-IDF vocabulary on the training corpus and reuse it for T.
 vectorizer = TfidfVectorizer()
 X = vectorizer.fit_transform(arr)
 feature_names = vectorizer.get_feature_names()
 test = vectorizer.transform(T)

 clf = linear_model.SGDClassifier(loss='log')
 clf.fit(X, Y)
 x = clf.predict(test)

 # For each test row keep the column indices of the n largest
 # probabilities; argsort is ascending, so they are the last n entries.
 n = 2
 probs = clf.predict_proba(test)
 ranked = np.argsort(probs, axis=1)
 topn = ranked[:, -n:]

Aquí presento el vector de etiquetas reales (ground truth). Son índices numéricos: debe asignar cada etiqueta ["elements", etc.] a su índice [0, 1, 2, etc.]. Aquí asumí que su ejemplo de prueba pertenece a "elements".

 # predict_proba orders its columns like clf.classes_ (the sorted unique
 # labels), not like Y, so the true class indices are looked up there
 # instead of being hard-coded.
 true_labels = ['elements', 'fruits']
 class_order = list(clf.classes_)
 y_true = np.array([class_order.index(name) for name in true_labels])

Esto debería calcular su exactitud.

 # Top-n accuracy: share of rows whose true class index appears among that
 # row's top-n predicted class indices.
 np.mean([
     y_true[k] in row
     for k, row in enumerate(topn)
 ])

Terminé resolviendo esto, aunque un poco diferente a lo anterior …

 # Set Data Location:
 data = 'top10000.csv'

 # Load the data, drop incomplete rows and keep only the two columns used.
 df = pd.read_csv(data, low_memory=False, thousands=',', encoding='latin-1')
 df = df.dropna()
 df = df[['CODE', 'DUTIES']]
 df = df.rename(columns={"CODE": "label", "DUTIES": "text"})

 # Turn each code into an integer so no label encoding is needed later:
 # remove dashes and dots, drop the first remaining character, cast to int.
 # regex=False keeps '.' a literal dot on every pandas version; read as a
 # regular expression it would match, and delete, every character.
 df['label'] = df['label'].str.replace('-', '', regex=False).str.strip()
 df['label'] = df['label'].str.replace('.', '', regex=False)
 df['label'] = df['label'].str[1:].astype(int)

 # Split data into training and validation sets.
 train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
     df.text, df.label, test_size=0.33, random_state=6)

 # Reset the index so validation rows are numbered 0..m-1; the merges in
 # TV_model join on that index.
 valid_y = valid_y.reset_index(drop=True)
 valid_x = valid_x.reset_index(drop=True)

 # Copy the validation datasets to dataframes to be able to merge later on.
 valid_x_df = pd.DataFrame(valid_x)
 valid_y_df = pd.DataFrame(valid_y)

 # Extract bag-of-words features; the vocabulary comes from training text.
 count_vect = CountVectorizer()
 X_train_counts = count_vect.fit_transform(train_x)
 X_test_counts = count_vect.transform(valid_x)


 def TV_model(classifier, feature_vector_train, label, feature_vector_valid,
              valid_y, valid_x, is_neural_net=False):
     """Fit ``classifier`` and print its top-5 and top-1 validation accuracy.

     Args:
         classifier: estimator providing ``fit``, ``predict``,
             ``predict_proba`` and ``classes_``.
         feature_vector_train: training features passed to ``fit``.
         label: training labels passed to ``fit``.
         feature_vector_valid: validation features to score.
         valid_y: true validation labels. Merged on the row index, so it is
             assumed to share a 0..m-1 index with the predictions (as the
             caller in this script arranges) -- confirm for other callers.
         valid_x: validation texts, merged on the same index.
         is_neural_net: unused; kept so existing calls keep working.

     Returns:
         None. The accuracy figures are printed.
     """
     classifier.fit(feature_vector_train, label)

     # Number of top-ranked labels to keep per validation row.
     n = 5
     probas = classifier.predict_proba(feature_vector_valid)
     predictions = classifier.predict(feature_vector_valid)

     # Column indices of the n most probable classes; argsort is ascending,
     # so the single most probable class ends up in the last column.
     top_n_predictions = np.argsort(probas, axis=1)[:, -n:]
     # Map the column indices back to the class labels they stand for.
     top_class = classifier.classes_[top_n_predictions]
     top_class_df = pd.DataFrame(data=top_class)

     # Merge the predictions with the validation labels and descriptions.
     results = pd.merge(valid_y, valid_x, left_index=True, right_index=True)
     results = pd.merge(results, top_class_df, left_index=True, right_index=True)

     # The first column holds the true label; columns 0..n-1 hold the
     # predicted labels from least to most probable.
     truth = results.iloc[:, 0]
     top5_conditions = [truth == results[rank] for rank in range(n)]
     top5_choices = [1] * n
     top1_conditions = [truth == results[n - 1]]
     top1_choices = [1]

     # Create the success columns: 1 where the true label was matched.
     results['Top 5 Successes'] = np.select(top5_conditions, top5_choices, default=0)
     results['Top 1 Successes'] = np.select(top1_conditions, top1_choices, default=0)

     total = results.shape[0]
     top5_rate = sum(results['Top 5 Successes']) / total
     top1_rate = sum(results['Top 1 Successes']) / total
     predict_rate = metrics.accuracy_score(valid_y, predictions)

     # Print the QA checks followed by the individual rates.
     print("Are Top 5 Results greater than Top 1 Result? (answer must be True): ", top5_rate > predict_rate)
     print("Are Top 1 Results equal from predict() and predict_proba()? (answer must be True): ", top1_rate == predict_rate)
     print(" ")
     print("Details: ")
     print("Top 5 Accuracy Rate (predict_proba)= ", top5_rate)
     print("Top 1 Accuracy Rate (predict_proba)= ", top1_rate)
     print("Top 1 Accuracy Rate = (predict)", predict_rate)


 # Train and validate model from example data using the function defined above.
 TV_model(LogisticRegression(), X_train_counts, train_y, X_test_counts, valid_y_df, valid_x_df)

Estoy seguro de que podría ser más eficiente desde el punto de vista de la computación, por lo que cualquier sugerencia sobre cómo podría transformar el cálculo de la tasa de precisión en una línea como se sugirió en los comentarios anteriores sería muy apreciada.