# CSS for line wrapping within the notebook
from IPython.display import HTML, display

# Stylesheet injected before each cell so <pre> output wraps long lines.
_WRAP_STYLE = '<style> pre { white-space: pre-wrap; } </style>'

def set_css():
    """Re-inject the pre-wrap stylesheet into the notebook output."""
    display(HTML(_WRAP_STYLE))

# Run set_css automatically before every cell executes.
get_ipython().events.register('pre_run_cell', set_css)
%%capture
# Quiet (-q) installs of the third-party NLP stack used below; %%capture
# (which must stay the first line of this cell) suppresses pip's output.
# nltk: sentence tokenization; rouge-score: summary evaluation metrics;
# youtube_transcript_api: caption download; bert-extractive-summarizer:
# the Summarizer model.
!pip install -q nltk
!pip install -q rouge-score
!pip install -q youtube_transcript_api
!pip install -q bert-extractive-summarizer
import nltk
import pandas as pd
import urllib.request
nltk.download('punkt')
from summarizer import Summarizer
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize
from youtube_transcript_api import YouTubeTranscriptApi as youtube
# DeepMind x UCL: The Deep Learning Lecture Series 2020
# See: https://deepmind.com/learning-resources/deep-learning-lecture-series-2020
video_id = '8zAP2qWAsKg'
# Enumerate the transcripts available for the video, then pick the
# manually-created British-English one.
available = youtube.list_transcripts(video_id)
#print (available)
en_gb = available.find_transcript(['en-GB'])
#print (en_gb)
segments = en_gb.fetch()  # caption segments for the video
# One dataframe row per caption segment; the 'text' column is joined below.
df1 = pd.DataFrame(segments)
df1.head()
# Join all of the caption strings into one body, then strip line breaks,
# tabs, and double-space runs left over from the captions.
body = ' '.join(df1['text'])
for junk in ('\n', '\r', '\t', '  '):
    body = body.replace(junk, '')
# Run the extractive BERT model and generate a fixed-length summary.
model = Summarizer()
summary_text = model(body, num_sentences=144)  # [ratio=0.3, num_sentences=144]
full = ''.join(summary_text)
#print(full)
# Segment the machine summary into sentences with nltk and report counts.
summ_wc = len(full.split())
machine_sentences = sent_tokenize(full)
summ_sc = len(machine_sentences)
print('Machine summary word count:', summ_wc)
print('Machine summary sentence count:', summ_sc)
# Output the summary to a text file named after its sentence/word counts.
machine_filename = f'summary_bert_{summ_sc}_{summ_wc}.txt'
# Explicit UTF-8 on both opens: the transcript can contain non-ASCII
# characters, and the platform default encoding (e.g. cp1252 on Windows)
# would raise UnicodeEncodeError or corrupt the round-trip.
with open(machine_filename, 'w', encoding='utf-8') as f:
    f.write(full)
# Read the machine summary back for ROUGE scoring below.
with open(machine_filename, 'r', encoding='utf-8') as f:
    machine_file = f.read()
# Download the human-written reference summary and get counts.
human_url = 'https://mkreager.github.io/nlp-summarization/human.txt'
with urllib.request.urlopen(human_url) as response:
    human_file = response.read().decode('utf-8')
human_wc = len(human_file.split())
human_sc = len(sent_tokenize(human_file))
# Fail loudly if the two summaries differ in sentence count — the
# comparison below assumes equal-length summaries. An explicit raise is
# used instead of a bare assert, which would be stripped under -O and
# would fail without a diagnostic.
if human_sc != summ_sc:
    raise ValueError(
        f'Sentence count mismatch: human={human_sc}, machine={summ_sc}')
#print(human_file)
print('Human summary word count:', human_wc)
print('Human summary sentence count:', human_sc)
# Evaluate the machine summary against the human reference with ROUGE
# (unigram, bigram, and longest-common-subsequence variants).
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
scores = scorer.score(human_file, machine_file)
print(scores)