#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# processing.py
# University of Zurich
# Department of Computational Linguistics
# Authors: Jenifer Leleany Meyer & Florian Heinz
# Matriculation Numbers: 19-919-695 & 19-111-889
import time
from typing import List, Tuple
import re
from random import randint
import csv
# NOTE(review): removed a stray no-op expression statement (`dir`) that was
# left over here; evaluating the builtin's name had no effect.
class Joke:
    """
    Represent a single joke and provide helpers for sentence splitting,
    tokenization, profanity filtering and human-readable printing.
    """

    def __init__(self, raw_joke) -> None:
        """
        Store the raw joke text untouched; all processing happens on demand.

        :param raw_joke: the unprocessed joke (as read from the data file)
        """
        self.raw_joke = raw_joke
    def split_into_sentences(self) -> List[str]:
        """
        Split the joke text into sentences.

        :return: a list with one inner list of sentence strings per input
                 line.  NOTE(review): despite the ``List[str]`` annotation,
                 the code builds and returns a list of lists of strings.
        """
        big_list = []    # final result: one list of sentences per line
        small_list = []  # sentences collected for the current line
        a = []           # scratch: split of the current line
        for line in self.raw_joke:
            new_string = ""
            # strip unwanted characters, keeping letters, digits and
            # common punctuation only
            # NOTE(review): this inner loop iterates self.raw_joke again,
            # not `line`; if raw_joke is a list of whole strings, `c` is a
            # full string and these per-character tests rarely match —
            # confirm whether `for c in line:` was intended.
            for c in self.raw_joke:
                # avoid all special characters
                if c.isalpha() or c.isdigit() or c in ".,\'\"?!° ()-…’“”":
                    new_string += c
            # the cleaned text replaces the original line
            line = new_string
            # split sentences after variations of periods
            if ". . ." in line:
                a = [line]
            elif "..\"" in line:
                a = line.split("..\"")
                a[0] += "..\""
                a[-1] += "..\""
            elif ". " in line:
                a = line.split(". ")
                a[0] += "."
                # "?nn" is presumably what an escaped "?\n\n" looks like
                # after the cleaning pass removes the backslashes — confirm
                if "?nn" in a[0]:
                    b = line.split("nn")
                    small_list.append(a[0])
                    small_list.extend(b)
            elif ".. " in line:
                # NOTE(review): unreachable — any line containing ".. "
                # also contains ". " and is caught by the branch above.
                a = line.split("..")
                a[0] += ".. "
                if "?nn" in a[0]:
                    b = line.split("nn")
                    if not small_list == []:
                        small_list.append(a[0])
                        small_list.extend(b)
            elif "... " in line:
                # NOTE(review): also unreachable for the same reason.
                a = line.split("... ")
                a[0] += "..."
                if "?nn" in a[0]:
                    b = line.split("nn")
                    small_list.append(a[0])
                    small_list.extend(b)
            elif "…" in line:
                a = line.split("… ")
                a[0] += "…"
            # split sentences after question mark
            elif "? " in line:
                a = line.split("? ")
                a[0] += "?"
                # a question inside parentheses: split the tail at ") "
                if ")" in a[0]:
                    b = a[1].split(") ")
                    b[0] += ")"
                    small_list.append(a[0])
                    small_list.extend(b)
            # don't lose sentences without punctuation
            else:
                big_list.append([line])
            # solve corner case "?\n\n"
            if "?nn" in line:
                a = line.split("nn")
            # keep sentences that "stand alone"
            if not small_list == []:
                big_list.append(small_list)
                small_list = []
            else:
                # NOTE(review): `a` may still hold the split of a previous
                # line (or the initial []) here — confirm this is intended.
                big_list.append(a)
        return big_list
    def _tokenize(self) -> List[List[str]]:
        """
        Tokenize every sentence produced by split_into_sentences().

        Separates leading quote/parenthesis characters and trailing
        punctuation (up to three characters) from each whitespace-separated
        word.

        :return: one list of tokens per sentence group
        """
        tokenized = []
        tokenized_list = []  # NOTE(review): never used in this method
        big_list2 = []
        sentences_str = self.split_into_sentences()
        # walk the list of lists containing sentences
        for u in sentences_str:
            tokenized = []
            # iterate through the sentences of the current group
            for sentence in u:
                # split sentences at whitespace
                words = sentence.split()
                # iterate through words and determine where to split
                for word in words:
                    # split a leading "(" from the word
                    if word[0] == "(":
                        tokenized.append("(")
                        splitted = word.split("(")
                        tokenized.append(splitted[1])
                    # split a leading "“" from the word
                    elif word[0] == "“":
                        tokenized.append("“")
                        splitted = word.split("“")
                        tokenized.append(splitted[1])
                    # split a leading "\"" from the word
                    elif word[0] == "\"":
                        tokenized.append("\"")
                        splitted = word.split("\"")
                        tokenized.append(splitted[1])
                    # split a trailing "!" when preceded by a lowercase letter
                    elif word[-1] == "!" and word[-2] in "abcdefghijklmnopqrstuvwxyz":
                        splitted = word.split("!")
                        tokenized.append(splitted[0])
                        tokenized.append("!")
                    # last character is punctuation (not a letter, digit,
                    # "!", "-", "," or "…").
                    # NOTE(review): the "alphabet" below is lowercase plus a
                    # stray capital "D" — other uppercase letters are
                    # treated as punctuation; looks like a typo, confirm.
                    elif not word[-1] in "abcdDefghijklmnopqrstuvwxyz0123456789!-,…":
                        if len(word) == 1:
                            tokenized.extend(word)
                        # the two characters before the last one are also
                        # punctuation: peel off the last three characters
                        elif len(word) > 2 and not word[-2] in "abcdDefghijklmnopqrstuvwxyz0123456789!-,…" and not word[
                                -3] in "abcdDefghijklmnopqrstuvwxyz0123456789!-,…":
                            # NOTE(review): `u` is rebound here, shadowing
                            # the outer loop variable of the same name.
                            u = word[-3]
                            v = word[-2]
                            w = word[-1]
                            tokenized.append((word.split(u))[0])
                            tokenized.extend([u, v, w])
                        # only the character before the last one is also
                        # punctuation: peel off the last two characters
                        elif len(word) > 1 and not word[-2] in "abcdDefghijklmnopqrstuvwxyz0123456789!-,…":
                            v = word[-2]
                            w = word[-1]
                            tokenized.append((word.split(v))[0])
                            tokenized.extend([v, w])
                        else:
                            # peel off the final character only
                            w = word[-1]
                            tokenized.append((word.split(w))[0])
                            tokenized.append(w)
                    # else, just add the word to the list unchanged
                    else:
                        tokenized.append(word)
            big_list2.append(tokenized)
        return big_list2
def filter_profanity(self, filename="profanities.txt") -> Tuple[List[List[str]], int]:
"""
Filter out all the profanity
"""
output = []
# profanity counter
num_profanities = 0
# Read in profanity file and store profanities in a list
with open(filename, "r", encoding="utf-8") as file:
profanities = file.read().split("\n")
lst = []
tokenized = self._tokenize()
# Go over every sentence
for sentence in tokenized:
for prof in profanities:
for token in sentence:
# Check if there is a profanity
if token.lower() in profanities or prof in token.lower():
num_profanities += 1
# Censor
sentence[sentence.index(token)] = len(token) * '#'
# Avoid appending empty lists
if sentence:
lst.append(sentence)
return (lst, num_profanities)
    def tell_joke(self) -> None:
        """
        Print the joke sentence by sentence, framing the punchline in an
        ASCII box after a pause.

        NOTE(review): ``self.filter_profanity()`` returns a *tuple*
        (sentences, count); iterating over it yields the sentence list
        first and then the int counter, so the loop below will fail on
        ``sentence[:-1]`` for the counter element — the tuple should
        probably be unpacked first; confirm.
        """
        filtered_string = self.filter_profanity()
        # define empty string
        output = ""
        post_str = ""
        # characters that don't need " " in front of them are handled in
        # the branches below
        for sentence in filtered_string:
            for i, token in enumerate(sentence[:-1]):
                # (i, token)
                if token in ",.!?)":
                    output += token
                elif sentence[i + 1] in "('’-–":
                    output += " " + token
                elif token in "('’-–\"":
                    output += token
                elif sentence[i - 1] in "('’-–\"":
                    output += token
                else:
                    # NOTE(review): unlike pretty_print/__repr__, no leading
                    # space is added here, so words run together — confirm.
                    output += token
            # NOTE(review): both branches below are identical; the space
            # variant is commented out.
            if sentence[-1] in ",.!?\"":
                output += sentence[-1]
            else:
                output += sentence[-1]
            #output += " " + sentence[-1]
            # define dimensions for framing (80-column screen)
            screen_width = 80
            text_width = len(output)
            box_width = text_width + 6
            left_margin = (screen_width - box_width) // 2
            #J = Joke(self.raw_joke)
            # print frame & sentence
            post_str = output.rstrip('\n')
            big_list = []
            small_list = []
            a = []
            # re-split the rendered text into sentences — this duplicates
            # split_into_sentences() on the joined string
            if ". . ." in post_str:
                a = [post_str]
            elif "..\"" in post_str:
                a = post_str.split("..\"")
                a[0] += "..\""
                a[-1] += "..\""
            elif ". " in post_str:
                a = post_str.split(". ")
                a[0] += "."
                if "?nn" in a[0]:
                    b = post_str.split("nn")
                    small_list.append(a[0])
                    small_list.extend(b)
            elif ".. " in post_str:
                # NOTE(review): unreachable — ".. " implies ". ", which is
                # caught by the branch above.
                a = post_str.split("..")
                a[0] += ".. "
                if "?nn" in a[0]:
                    b = post_str.split("nn")
                    if not small_list == []:
                        small_list.append(a[0])
                        small_list.extend(b)
            elif "... " in post_str:
                # NOTE(review): unreachable for the same reason.
                a = post_str.split("... ")
                a[0] += "..."
                if "?nn" in a[0]:
                    b = post_str.split("nn")
                    small_list.append(a[0])
                    small_list.extend(b)
            elif "…" in post_str:
                a = post_str.split("… ")
                a[0] += "…"
            # split sentences after question mark
            elif "? " in post_str:
                a = post_str.split("? ")
                a[0] += "?"
                if ")" in a[0]:
                    b = a[1].split(") ")
                    b[0] += ")"
                    small_list.append(a[0])
                    small_list.extend(b)
            # don't lose sentences without punctuation
            else:
                big_list.append([post_str])
            # solve corner case "?\n\n"
            if "?nn" in post_str:
                a = post_str.split("nn")
            # keep sentences that "stand alone"
            if not small_list == []:
                big_list.append(small_list)
                small_list = []
            else:
                big_list.append(a)
            last_sentence = big_list[-1]
            # NOTE(review): this comparison is always True (both sides are
            # big_list[-1]), so the else branch is unreachable — the intent
            # was probably to frame only the *last* sentence; confirm.
            if last_sentence == big_list[-1]:
                # pause for comedic effect before the punchline
                time.sleep(7)
                print('\n' + ' ' * left_margin + '+' + '-' * (box_width - 4) + '+')
                print(' ' * left_margin + '| ' + ' ' * text_width + ' |')
                print(' ' * left_margin + '| ' + output + ' |')
                print(' ' * left_margin + '| ' + ' ' * text_width + ' |')
                print(' ' * left_margin + '+' + '-' * (box_width - 4) + '+\n')
            else:
                print("\n")
                print(output)
            previous_output = output  # NOTE(review): never read afterwards
            output = ""
@staticmethod
def pretty_print(joke) -> str:
"""
Print in a humanly readable way
"""
# define empty string
output = ""
# list containing characters the don't need " " infront of them
for sentence in self.raw_joke:
for i, token in enumerate(sentence[:-1]):
# (i, token)
if token in ",.!?)":
output += token
elif sentence[i + 1] in "('’-–":
output += " " + token
elif token in "('’-–\"":
output += token
elif sentence[i - 1] in "('’-–\"":
output += token
else:
output += " " + token
if sentence[-1] in ",.!?\"":
output += sentence[-1]
else:
output += " " + sentence[-1]
# define dimensions for framing
screen_width = 80
text_width = len(output)
box_width = text_width + 6
left_margin = (screen_width - box_width) // 2
# print frame & sentence
print('\n' + ' ' * left_margin + '+' + '-' * (box_width - 4) + '+')
print(' ' * left_margin + '| ' + ' ' * text_width + ' |')
print(' ' * left_margin + '| ' + output + ' |')
print(' ' * left_margin + '| ' + ' ' * text_width + ' |')
print(' ' * left_margin + '+' + '-' * (box_width - 4) + '+\n')
def __repr__(self):
"""
responsible for the representation of the jokes
"""
# define empty string
output = ""
# list containing characters the don't need " " infront of them
for sentence in self.raw_joke:
for i, token in enumerate(sentence[:-1]):
# (i, token)
if token in ",.!?)":
output += token
elif sentence[i + 1] in "('’-–":
output += " " + token
elif token in "('’-–\"":
output += token
elif sentence[i - 1] in "('’-–\"":
output += token
else:
output += " " + token
if sentence[-1] in ",.!?\"":
output += sentence[-1]
else:
output += " " + sentence[-1]
# define dimensions for framing
screen_width = 80
text_width = len(output)
box_width = text_width + 6
left_margin = (screen_width - box_width) // 2
# print frame & sentence
print('\n' + ' ' * left_margin + '+' + '-' * (box_width - 4) + '+')
print(' ' * left_margin + '| ' + ' ' * text_width + ' |')
print(' ' * left_margin + '| ' + output + ' |')
print(' ' * left_margin + '| ' + ' ' * text_width + ' |')
print(' ' * left_margin + '+' + '-' * (box_width - 4) + '+\n')
    def __eq__(self, other):
        """
        Check whether this joke's score equals the other joke's score.

        NOTE(review): ``jokes`` and ``scores`` are locals of main(), not
        module-level names, and ``self.i`` is never assigned anywhere in
        this class — every comparison therefore raises NameError or
        AttributeError at call time.  The computed ``index_of_self`` /
        ``index_of_other`` are unused; presumably ``scores[index_of_self]``
        was intended.  Also note: defining __eq__ without __hash__ makes
        instances unhashable.  The same applies to the four comparison
        methods below.
        """
        # TODO: your implementation here
        index_of_self = jokes.index(self.raw_joke)
        index_of_other = jokes.index(other.raw_joke)
        if scores[self.i] == scores[other.i]:
            return True
        else:
            return False

    def __lt__(self, other):
        """
        Check whether this joke's score is lower than the other's.
        (See the NOTE in __eq__: currently fails on undefined names.)
        """
        # TODO: your implementation here
        index_of_self = jokes.index(self.raw_joke)
        index_of_other = jokes.index(other.raw_joke)
        if scores[self.i] < scores[other.i]:
            return True
        else:
            return False

    def __gt__(self, other):
        """
        Check whether this joke's score is greater than the other's.
        (See the NOTE in __eq__: currently fails on undefined names.)
        """
        # TODO: your implementation here
        index_of_self = jokes.index(self.raw_joke)
        index_of_other = jokes.index(other.raw_joke)
        if scores[self.i] > scores[other.i]:
            return True
        else:
            return False

    def __le__(self, other):
        """
        Check whether this joke's score is lower than or equal to the
        other's.  (See the NOTE in __eq__: currently fails on undefined
        names.)
        """
        # TODO: your implementation here
        index_of_self = jokes.index(self.raw_joke)
        index_of_other = jokes.index(other.raw_joke)
        if scores[self.i] <= scores[other.i]:
            return True
        else:
            return False

    def __ge__(self, other):
        """
        Check whether this joke's score is greater than or equal to the
        other's.  (See the NOTE in __eq__: currently fails on undefined
        names.)
        """
        # TODO: your implementation here
        index_of_self = jokes.index(self.raw_joke)
        index_of_other = jokes.index(other.raw_joke)
        if scores[self.i] >= scores[other.i]:
            return True
        else:
            return False
class JokeGenerator:
    """
    Create Joke objects from pre-parsed joke data and tell them.
    """

    def __init__(self, filename) -> None:
        """
        :param filename: NOTE(review) — despite the name, callers pass an
            already-parsed list of jokes (see main()), not a file name;
            it is stored as-is.
        """
        self.joke_list = filename

    def make_jokes_objects(self) -> List:
        """
        Wrap every raw joke in a Joke object.

        :return: a list of Joke instances, one per entry of joke_list
        """
        return [Joke(one_joke) for one_joke in self.joke_list]

    def generate_jokes(self):
        """
        Tell every joke that splits into at least two sentences.

        Bug fix vs. the original: it called ``Joke()`` without the
        required raw_joke argument (TypeError) and ``self.tell_joke()``,
        a method JokeGenerator does not have (AttributeError).
        """
        for joke in self.make_jokes_objects():
            if len(joke.split_into_sentences()) >= 2:
                joke.tell_joke()

    def random_joke(self):
        """
        Choose a random joke from the list and tell it.

        :return: whatever Joke.tell_joke() returns (currently None)
        """
        i = randint(0, len(self.joke_list) - 1)
        chosen = Joke(self.joke_list[i])
        return chosen.tell_joke()
def main():
    """
    Read dadjokes_sample.csv, collect the jokes and their scores, and tell
    one randomly chosen joke.
    """
    with open("dadjokes_sample.csv", "r", encoding="utf-8") as file:
        # parallel lists: jokes[i] belongs to scores[i]
        jokes = []
        scores = []
        # read the csv with DictReader (expects "joke", "score", "date"
        # columns — rows with extra fields land under the None rest-key)
        read = csv.DictReader(file)
        for row in read:
            # a joke containing unquoted separators is split over extra
            # columns; DictReader collects those under the key None, so
            # they belong to the joke text.
            # Bug fix vs. the original: extras were appended *after*
            # score/date, so big_list[-2] / big_list[0:-2] then picked the
            # wrong elements.
            joke_parts = [row.get("joke")]
            if None in row:
                joke_parts.extend(row.get(None))
            jokes.append(joke_parts)
            scores.append(row.get("score"))
        # bonus task: scores are CSV *strings*; compare numerically instead
        # of lexicographically (the original max() compared strings, so
        # e.g. "9" beat "100")
        if scores:
            max_score = max(scores, key=int)
            index_of_max_score = scores.index(max_score)
            #print(f"The joke with the highest score is: {jokes[index_of_max_score]} with a score of {max_score}.")
        jg = JokeGenerator(jokes)
        print(jg.random_joke())
# run the demo only when executed as a script, not when imported
if __name__ == '__main__':
    main()