Import texthero ... 
import texthero as hero
import pandas as pd
... load any text dataset with Pandas
df = pd.read_csv(
    "https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv"
)
df.head(2)
|  | text | topic |
|---|---|---|
| 0 | Claxton hunting first major medal\n\nBritish h... | athletics |
| 1 | O'Sullivan could run in Worlds\n\nSonia O'Sull... | athletics |
Preprocess it ...
df['text'] = hero.clean(df['text'])
|  | text | topic |
|---|---|---|
| 0 | claxton hunting first major medal british hurd... | athletics |
| 1 | sullivan could run worlds sonia sullivan indic... | athletics |
... represent it
df['tfidf'] = (
    hero.tfidf(df['text'], max_features=100)
)
df[["tfidf", "topic"]].head(2)
                        
|  | tfidf | topic |
|---|---|---|
| 0 | [0.0, 0.13194458247285848, 0.0, 0.0, 0.0, 0.0,... | athletics |
| 1 | [0.0, 0.13056235989725676, 0.0, 0.205187581391... | athletics |
Reduce dimension and visualize the vector space
df['pca'] = hero.pca(df['tfidf'])
hero.scatterplot(
    df, 
    col='pca', 
    color='topic', 
    title="PCA BBC Sport news"
)

... need more? Find named entities
df['named_entities'] = (
    hero.named_entities(df['text'])  # one list of (entity, label, start, end) tuples per row
)
df[['named_entities', 'topic']].head(2)
                        
|  | named_entities | topic |
|---|---|---|
| 0 | [(claxton, ORG, 0, 7), (first, ORDINAL, 16, 21... | athletics |
| 1 | [(sullivan, ORG, 0, 8), (sonia sullivan, PERSO... | athletics |
Show top words ...
NUM_TOP_WORDS = 5
hero.top_words(df['text'])[:NUM_TOP_WORDS]
                        
|  | text |
|---|---|
| said | 1338 |
| first | 790 |
| england | 749 |
| game | 681 |
| one | 671 |
And much more!