In [28]:
# CSS for line wrapping within the notebook
from IPython.display import HTML, display
def set_css():
  display(HTML('<style> pre { white-space: pre-wrap; } </style>'))
get_ipython().events.register('pre_run_cell', set_css)
In [29]:
%%capture
!pip install -q nltk
!pip install -q rouge-score
!pip install -q youtube_transcript_api
!pip install -q bert-extractive-summarizer
In [30]:
import nltk
import pandas as pd
import urllib.request
nltk.download('punkt')
from summarizer import Summarizer
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize
from youtube_transcript_api import YouTubeTranscriptApi as youtube
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
In [31]:
# DeepMind x UCL: The Deep Learning Lecture Series 2020
# See: https://deepmind.com/learning-resources/deep-learning-lecture-series-2020
video_id = '8zAP2qWAsKg'
transcript_list = youtube.list_transcripts(video_id)    # list of available transcripts
#print (transcript_list)
transcript = transcript_list.find_transcript(['en-GB']) # select manual English transcript
#print (transcript)
d = transcript.fetch()                                  # get transcript data from the video
In [32]:
# Put the transcript data into a dataframe
df1 = pd.DataFrame(d)
df1.head()
Out[32]:
text start duration
0 Hello and welcome to the 7.68 3.16
1 UCL and DeepMind lecture series. 10.84 3.20
2 My name's Felix Hill and I'm 14.64 2.24
3 going to be talking to you about 16.88 2.48
4 deep learning and language 19.36 2.52
In [33]:
# Join all of the text strings
body = ' '.join(df1['text']).replace('\n','').replace('\r','') \
                                 .replace('\t','').replace('  ','')
In [34]:
# Run the model and generate the summary
model = Summarizer()
result = model(body, num_sentences=144) #[ratio=0.3, num_sentences=144]
full = ''.join(result)
#print(full)
In [35]:
# Segement sentences with nltk and get counts
sentences = sent_tokenize(full)
summ_wc = len(full.split())
summ_sc = len(sentences)
print('Machine summary word count:', summ_wc)
print('Machine summary sentence count:', summ_sc)
Machine summary word count: 3713
Machine summary sentence count: 144
In [36]:
# Output the summary to a text file
machine_filename = 'summary_bert_%s_%s.txt' % (summ_sc, summ_wc)
with open(machine_filename, 'w') as f:
    f.write(full)
In [40]:
# Open the machine summary
with open(machine_filename, 'r') as f:
    machine_file = f.read()
In [41]:
# Open the human summary and get counts
human_url = 'https://mkreager.github.io/nlp-summarization/human.txt'

with urllib.request.urlopen(human_url) as response:
   human_file = response.read().decode('utf-8')

human_wc = len(human_file.split())
human_sc = len(sent_tokenize(human_file))
assert human_sc == summ_sc

#print(human_file)
print('Human summary word count:', human_wc)
print('Human summary sentence:', human_sc)
Human summary word count: 6703
Human summary sentence: 144
In [42]:
# Evaluate the machine summary vs. human summary
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(human_file, machine_file)
print(scores)
{'rouge1': Score(precision=0.913570487483531, recall=0.5070195963732086, fmeasure=0.6521207561365561), 'rouge2': Score(precision=0.6172904586188719, recall=0.34254790112622496, fmeasure=0.44059825039977424), 'rougeL': Score(precision=0.52832674571805, recall=0.29321439017256506, fmeasure=0.37712780964920534)}