## regular expressions
import re

## data and text mining functionality
import nltk
# data
from nltk.corpus import inaugural
# list of stopwords
from nltk.corpus import stopwords 
# tokenizer
from nltk.tokenize import word_tokenize
# Lemmatizer/Stemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer

## Data analysis and visualization
import pandas as pd


# file ids of all speeches in the inaugural corpus ("<year>-<president>.txt")
speeches = inaugural.fileids()
speeches

['1789-Washington.txt',
 '1793-Washington.txt',
 '1797-Adams.txt',
 '1801-Jefferson.txt',
 '1805-Jefferson.txt',
 '1809-Madison.txt',
 '1813-Madison.txt',
 '1817-Monroe.txt',
 '1821-Monroe.txt',
 '1825-Adams.txt',
 '1829-Jackson.txt',
 '1833-Jackson.txt',
 '1837-VanBuren.txt',
 '1841-Harrison.txt',
 '1845-Polk.txt',
 '1849-Taylor.txt',
 '1853-Pierce.txt',
 '1857-Buchanan.txt',
 '1861-Lincoln.txt',
 '1865-Lincoln.txt',
 '1869-Grant.txt',
 '1873-Grant.txt',
 '1877-Hayes.txt',
 '1881-Garfield.txt',
 '1885-Cleveland.txt',
 '1889-Harrison.txt',
 '1893-Cleveland.txt',
 '1897-McKinley.txt',
 '1901-McKinley.txt',
 '1905-Roosevelt.txt',
 '1909-Taft.txt',
 '1913-Wilson.txt',
 '1917-Wilson.txt',
 '1921-Harding.txt',
 '1925-Coolidge.txt',
 '1929-Hoover.txt',
 '1933-Roosevelt.txt',
 '1937-Roosevelt.txt',
 '1941-Roosevelt.txt',
 '1945-Roosevelt.txt',
 '1949-Truman.txt',
 '1953-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1965-Johnson.txt',
 '1969-Nixon.txt',
 '1973-Nixon.txt',
 '1977-Carter.txt',
 '1981-Reagan.txt',
 '1985-Reagan.txt',
 '1989-Bush.txt',
 '1993-Clinton.txt',
 '1997-Clinton.txt',
 '2001-Bush.txt',
 '2005-Bush.txt',
 '2009-Obama.txt',
 '2013-Obama.txt',
 '2017-Trump.txt']


# Year of each speech: the four digits in front of the hyphen of the file id,
# e.g. "1789-Washington.txt" -> "1789" (kept as a string).
# The pattern is a raw string so that "\d" reaches the regex engine unchanged
# instead of being treated as an (invalid) string escape.
years_lst = [re.search(r'(\d{4})-', speech).group(1) for speech in speeches]
years_lst[0:10]

['1789',
 '1793',
 '1797',
 '1801',
 '1805',
 '1809',
 '1813',
 '1817',
 '1821',
 '1825']


# President of each speech: the letters between the hyphen and the dot of the
# file id, e.g. "1789-Washington.txt" -> "Washington".
# The pattern is a raw string so that "\." reaches the regex engine unchanged
# instead of being treated as an (invalid) string escape.
presidents_lst = [re.search(r'-([A-Za-z]*)\.', speech).group(1) for speech in speeches]
presidents_lst[0:10]

['Washington',
 'Washington',
 'Adams',
 'Jefferson',
 'Jefferson',
 'Madison',
 'Madison',
 'Monroe',
 'Monroe',
 'Adams']


# raw text of the file "2013-Obama.txt" as one string
Obama_2013_raw = inaugural.raw("2013-Obama.txt")
Obama_2013_raw[0:500]

'Thank you. Thank you so much.\n\nVice President Biden, Mr. Chief Justice, Members of the United States Congress, distinguished guests, and fellow citizens:\n\nEach time we gather to inaugurate a President we bear witness to the enduring strength of our Constitution. We affirm the promise of our democracy. We recall that what binds this Nation together is not the colors of our skin or the tenets of our faith or the origins of our names. What makes us exceptionalâ\x80\x94what makes us Americanâ\x80\x94is our alle'


# "â\x80\x94" is what the UTF-8 bytes of an em dash look like when they are
# decoded as Latin-1; replace it by " - " so the words on both sides are separated
Obama_2013_str = Obama_2013_raw.replace("â\x80\x94", " - ")
Obama_2013_str[0:500]

'Thank you. Thank you so much.\n\nVice President Biden, Mr. Chief Justice, Members of the United States Congress, distinguished guests, and fellow citizens:\n\nEach time we gather to inaugurate a President we bear witness to the enduring strength of our Constitution. We affirm the promise of our democracy. We recall that what binds this Nation together is not the colors of our skin or the tenets of our faith or the origins of our names. What makes us exceptional - what makes us American - is our alle'


# normalize to lower case so that e.g. "America" and "america" count as the
# same word; str.lower() converts the whole string in one call
Obama_2013_lowcase_str = Obama_2013_str.lower()
Obama_2013_lowcase_str[0:500]

'thank you. thank you so much.\n\nvice president biden, mr. chief justice, members of the united states congress, distinguished guests, and fellow citizens:\n\neach time we gather to inaugurate a president we bear witness to the enduring strength of our constitution. we affirm the promise of our democracy. we recall that what binds this nation together is not the colors of our skin or the tenets of our faith or the origins of our names. what makes us exceptional - what makes us american - is our alle'


# split the lower-cased text into word and punctuation tokens
Obama_2013_words = word_tokenize(Obama_2013_lowcase_str)
Obama_2013_words[0:25]

['thank',
 'you',
 '.',
 'thank',
 'you',
 'so',
 'much',
 '.',
 'vice',
 'president',
 'biden',
 ',',
 'mr.',
 'chief',
 'justice',
 ',',
 'members',
 'of',
 'the',
 'united',
 'states',
 'congress',
 ',',
 'distinguished',
 'guests']


# The stop word lists are addressed by file id and the English list is stored
# as "english"; the capitalized spelling only resolves on case-insensitive
# file systems.
stop_words = list(stopwords.words("english"))
stop_words[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


# number of entries in the stop word list
len(stop_words)

179


# distinct one-character tokens: candidates for additional stop words
{token for token in Obama_2013_words if len(token) == 1}

{',', '-', '.', '4', ':', ';', 'a', 'i'}


# punctuation tokens that are dropped together with the stop words
stop_words_to_add = [",", ".", ":", ";", "-"]
# merged into one set: duplicates collapse and membership tests are fast
stop_words_extended = {*stop_words, *stop_words_to_add}
len(stop_words_extended)

184


# keep only tokens that are neither stop words nor the listed punctuation
Obama_2013_clean = [
    token
    for token in Obama_2013_words
    if token not in stop_words_extended
]
Obama_2013_clean[0:25]

['thank',
 'thank',
 'much',
 'vice',
 'president',
 'biden',
 'mr.',
 'chief',
 'justice',
 'members',
 'united',
 'states',
 'congress',
 'distinguished',
 'guests',
 'fellow',
 'citizens',
 'time',
 'gather',
 'inaugurate',
 'president',
 'bear',
 'witness',
 'enduring',
 'strength']


# frequency distribution (token -> count) of the cleaned tokens
nltk.FreqDist(Obama_2013_clean)

FreqDist({'us': 21, 'must': 17, 'people': 11, "'s": 11, 'time': 10, 'america': 8, 'together': 7, 'country': 7, 'make': 7, 'every': 7, ...})


# keep the distribution so that the count of a single token can be looked up by key
Obama_2013_freq_dist = nltk.FreqDist(Obama_2013_clean)
Obama_2013_freq_dist["america"]

8


def clean_text(corpus, file_id, stop_words_list):
    """
    Clean one text of a corpus and return it as a list of word tokens.

    Processing steps, in this order:
    - read the raw text of `file_id` from `corpus`
    - replace em dashes (mis-decoded or correctly decoded) by " - " so that
      the words on both sides become separate tokens
    - normalize the text to lower case
    - tokenize the text with nltk.word_tokenize
    - remove every token that is contained in `stop_words_list`

    Input:
    - corpus: corpus reader object offering raw(file_id)
      (e.g. nltk.corpus.inaugural)
    - file_id: name of the file within the corpus (file_id in NLTK)
    - stop_words_list: list or set of tokens to remove; it is compared with
      the lower-cased tokens

    Output: cleaned text as a list of tokens
    """
    text_raw = corpus.raw(file_id)
    # "â\x80\x94" is the UTF-8 byte sequence of an em dash decoded as Latin-1.
    # A correctly decoded em dash ("\u2014") is treated the same way, so the
    # result does not depend on how the corpus file was decoded.
    text_str = text_raw.replace("â\x80\x94", " - ").replace("\u2014", " - ")
    text_lowcase_str = text_str.lower()
    text_words = nltk.word_tokenize(text_lowcase_str)
    # A set gives constant-time membership tests even if a list was passed in.
    stop_words = set(stop_words_list)
    return [word for word in text_words if word not in stop_words]


# run the helper on the speech that was cleaned step by step above
test = clean_text(inaugural, "2013-Obama.txt", stop_words_extended)
test[0:15]

['thank',
 'thank',
 'much',
 'vice',
 'president',
 'biden',
 'mr.',
 'chief',
 'justice',
 'members',
 'united',
 'states',
 'congress',
 'distinguished',
 'guests']


# same pipeline for the file "1789-Washington.txt": clean, count, look up "america"
Washington_1789_clean = clean_text(inaugural, "1789-Washington.txt", stop_words_extended)
Washington_1789_freq_dist = nltk.FreqDist(Washington_1789_clean)
Washington_1789_freq_dist["america"]

0


# "american" is counted as a token of its own, separate from "america"
Washington_1789_freq_dist["american"]

2


# word frequencies per speech, in the same order as `speeches`
america_frq_lst = []
constitution_frq_lst = []

# clean each speech, count its tokens once and look up both words in that
# single frequency distribution
for speech in speeches:
    text_clean_lst = clean_text(inaugural, speech, stop_words_extended)
    speech_freq_dist = nltk.FreqDist(text_clean_lst)
    america_frq_lst.append(speech_freq_dist["america"])
    constitution_frq_lst.append(speech_freq_dist["constitution"])


## creating data frame

# column name -> column values, one entry per speech
freq_dict = dict(
    Year=years_lst,
    President=presidents_lst,
    America=america_frq_lst,
    Constitution=constitution_frq_lst,
)
# the rows are labelled with the year strings
freq_df = pd.DataFrame(freq_dict, index=years_lst)

freq_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58 entries, 1789 to 2017
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Year          58 non-null     object
 1   President     58 non-null     object
 2   America       58 non-null     int64 
 3   Constitution  58 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 2.3+ KB


# summary statistics of the two count columns
freq_df.describe()


# line plot of the count columns over the year index
freq_df.plot.line()

<AxesSubplot:>


test_words = ["constitution", "constitutions", "constitutional"]

# lemmatize the words with WordNet
test_lemmatizer = WordNetLemmatizer()
test_lemmatized_words = list(map(test_lemmatizer.lemmatize, test_words))
print("Lemma: ", test_lemmatized_words)

# stem the same words with the Porter stemmer ...
stemmer_ps = PorterStemmer()
stemmed_words_ps = list(map(stemmer_ps.stem, test_words))
print("Porter Stemmer: ", stemmed_words_ps)

# ... and with the English Snowball stemmer
stemmer_sb = SnowballStemmer("english")
stemmed_words_sb = list(map(stemmer_sb.stem, test_words))
print("Snowball Stemmer: ", stemmed_words_sb)

Lemma:  ['constitution', 'constitution', 'constitutional']
Porter Stemmer:  ['constitut', 'constitut', 'constitut']
Snowball Stemmer:  ['constitut', 'constitut', 'constitut']


test_words = ["america", "american", "americans"]

# lemmatize the words with WordNet
test_lemmatizer = WordNetLemmatizer()
test_lemmatized_words = list(map(test_lemmatizer.lemmatize, test_words))
print("Lemma: ", test_lemmatized_words)

# stem the same words with the Porter stemmer ...
stemmer_ps = PorterStemmer()
stemmed_words_ps = list(map(stemmer_ps.stem, test_words))
print("Porter Stemmer: ", stemmed_words_ps)

# ... and with the English Snowball stemmer
stemmer_sb = SnowballStemmer("english")
stemmed_words_sb = list(map(stemmer_sb.stem, test_words))
print("Snowball Stemmer: ", stemmed_words_sb)

Lemma:  ['america', 'american', 'american']
Porter Stemmer:  ['america', 'american', 'american']
Snowball Stemmer:  ['america', 'american', 'american']


# same comparison with capitalized input words
test_words = ["America", "American", "Americans"]

# lemmatize the words with WordNet
test_lemmatizer = WordNetLemmatizer()
test_lemmatized_words = list(map(test_lemmatizer.lemmatize, test_words))
print("Lemma: ", test_lemmatized_words)

# stem the same words with the Porter stemmer ...
stemmer_ps = PorterStemmer()
stemmed_words_ps = list(map(stemmer_ps.stem, test_words))
print("Porter Stemmer: ", stemmed_words_ps)

# ... and with the English Snowball stemmer
stemmer_sb = SnowballStemmer("english")
stemmed_words_sb = list(map(stemmer_sb.stem, test_words))
print("Snowball Stemmer: ", stemmed_words_sb)

Lemma:  ['America', 'American', 'Americans']
Porter Stemmer:  ['america', 'american', 'american']
Snowball Stemmer:  ['america', 'american', 'american']


test_words = ["america", "constitution", "american", "constitutions", "americans", "constitutional"]

# Map both variants onto "america". The longer variant has to be replaced
# first, otherwise "americans" would end up as "americas".
test_words_new = [
    word.replace('americans', 'america').replace('american', 'america')
    for word in test_words
]

test_words_new

['america',
 'constitution',
 'america',
 'constitutions',
 'america',
 'constitutional']


# counts of the three spellings in the cleaned 2013 speech
for label, key in (
    ("America :", "america"),
    ("American:", "american"),
    ("Americans:", "americans"),
):
    print(label, Obama_2013_freq_dist[key])

America : 8
American: 6
Americans: 4


def replace_synonyms_america(text, synonyms, new_word):
    """
    Replace every word that is one of `synonyms` by `new_word`.

    Input:
    - text: iterable of words (e.g. a cleaned, tokenized text)
    - synonyms: collection of words to be replaced
    - new_word: word inserted in place of each synonym

    Output: new list of words; `text` itself is left unchanged
    """
    # Build a new list instead of assigning into `text`, so the list of the
    # caller keeps its original words.
    return [new_word if word in synonyms else word for word in text]


synonyms_america = ["americans", "american"]
# Work on a copy so that Obama_2013_clean keeps its original tokens, whatever
# the helper does with the list it receives.
Obama_2013_new = replace_synonyms_america(list(Obama_2013_clean), synonyms_america, "america")

# count of "america" after merging the synonyms into it
nltk.FreqDist(Obama_2013_new)["america"]

18


def stem_text(text):
    """
    Reduce every word of a text to its stem (English SnowballStemmer).

    Input: list of (cleaned) words
    Output: list of stemmed words, in the same order
    """
    snowball = SnowballStemmer("english")
    return list(map(snowball.stem, text))


synonyms_america = ["americans", "american"]

# word frequencies per speech, in the same order as `speeches`
america_w_synonyms_frq_lst = []
constitution_stmd_frq_lst = []

for speech in speeches:
    text_clean_lst = clean_text(inaugural, speech, stop_words_extended)
    # merge the synonyms before stemming
    text_clean_replaced = replace_synonyms_america(text_clean_lst, synonyms_america, "america")
    text_final = stem_text(text_clean_replaced)
    # count the stemmed tokens once and look up both words in that distribution
    final_freq_dist = nltk.FreqDist(text_final)
    america_w_synonyms_frq_lst.append(final_freq_dist["america"])
    # "constitut" is the stem the Snowball stemmer produces for
    # constitution / constitutions / constitutional
    constitution_stmd_frq_lst.append(final_freq_dist["constitut"])


## creating data frame

# NOTE(review): column_names is not referenced anywhere in the visible code
column_names = ["Year", "President", "America", "Constitution"]

# column name -> column values, one entry per speech
freq_stmd_syn_dict = {"Year": years_lst, 
                     "President": presidents_lst, 
                     "America": america_w_synonyms_frq_lst, 
                     "Constitution": constitution_stmd_frq_lst}
# as data frame, rows labelled with the year strings
freq_stmd_syn_df = pd.DataFrame(data = freq_stmd_syn_dict, index = years_lst)
freq_stmd_syn_df.describe()


# line plot of the count columns over the year index
freq_stmd_syn_df.plot.line()

<AxesSubplot:>

	America	Constitution
count	58.000000	58.000000
mean	3.655172	3.551724
std	5.510920	6.500128
min	0.000000	0.000000
25%	0.000000	0.000000
50%	1.000000	1.000000
75%	6.000000	3.750000
max	21.000000	36.000000

	America	Constitution
count	58.000000	58.000000
mean	7.810345	4.931034
std	8.897973	8.273530
min	0.000000	0.000000
25%	1.250000	0.000000
50%	4.500000	2.000000
75%	11.000000	6.000000
max	35.000000	45.000000

Text Mining: Grundlagen

Einführung

Bibliotheken und Daten laden

Meta-Daten

Zeichenketten extrahieren (reguläre Ausdrücke)

Die Reden

Normalisierung

Tokenization

Irrelevante Wörter entfernen

Worthäufigkeiten zählen

Hilfsfunktion zur Textaufbereitung

Analyse ohne Stemming

Analyse mit Stemming

Stemming/Lemmatisierung

Analyse

Weitere Informationen