# CSS for line wrapping within the notebook
from IPython.display import HTML, display
def set_css():
    """Inject CSS so long <pre> output wraps instead of scrolling sideways."""
    display(HTML('<style> pre { white-space: pre-wrap; } </style>'))
# Re-register on every cell run: notebook display state is per-cell
get_ipython().events.register('pre_run_cell', set_css)
%%capture
# Install NLP / summarization dependencies quietly (%%capture hides pip output)
!pip install -q nltk
!pip install -q spacy
!pip install -q rouge-score
!pip install -q transformers
!pip install -q -U pytextrank
!pip install -q youtube_transcript_api
# Small English model for the spaCy pipeline loaded below
!python -m spacy download en_core_web_sm
import re
import gc
import nltk
import torch
import spacy
import pytextrank  # registers the textrank/positionrank/biasedtextrank pipes with spaCy
import pandas as pd
from math import ceil
import urllib.request
nltk.download('punkt')  # sentence-tokenizer models required by sent_tokenize
from spacy.language import Language
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize
from youtube_transcript_api import YouTubeTranscriptApi as youtube
from transformers import LEDForConditionalGeneration, LEDTokenizer
device = "cuda" if torch.cuda.is_available() else "cpu" # use GPU if available
# DeepMind x UCL: The Deep Learning Lecture Series 2020
# See: https://deepmind.com/learning-resources/deep-learning-lecture-series-2020
video_id = '8zAP2qWAsKg'

# Select the manually-created English transcript for this video and fetch it
available = youtube.list_transcripts(video_id)
transcript = available.find_transcript(['en-GB'])
segments = transcript.fetch()  # phrase-level entries: text / start / duration

# Load the phrases into a dataframe and attach a time flag to each one so
# sentence-level timestamps can be recovered after re-segmentation
df1 = pd.DataFrame(segments)
start_secs = df1['start'].fillna(0).astype(int) - 1  # clip decimals, back up 1s
df1['start'] = start_secs
df1['time'] = 't=' + start_secs.astype(str)          # time with 't' prefix
df1['time_text'] = df1['time'] + ' ' + df1['text']   # "t=N phrase text"
df1 = df1.drop(columns=['start', 'duration'])
df1.head()
# Join all of the text strings and strip control whitespace.
# NOTE(review): the final .replace(' ', '') removes EVERY space, which would
# destroy word boundaries downstream; it was presumably meant to collapse
# doubled spaces ('  ') introduced by the join -- confirm against the
# original notebook before relying on word counts.
orig_text = ' '.join(df1['text']).replace('\n','').replace('\r','') \
    .replace('\t','').replace(' ','')
time_text = ' '.join(df1['time_text']).replace('\n','').replace('\r','') \
    .replace('\t','').replace(' ','')
# Segment sentences with nltk
sent_orig = sent_tokenize(orig_text)
sent_time = sent_tokenize(time_text)
# Append '\n' to each sentence so spaCy can be forced to follow nltk's
# segmentation later (see the set_custom_boundaries component)
sent_orig_n = [s + '\n' for s in sent_orig]
# Re-join for the nlp pipeline (map(str, ...) was a no-op on strings)
orig_text_n = ''.join(sent_orig_n)
# Extract the first timestamp in each time-flagged sentence and associate it
# with the corresponding plain sentence
df2 = pd.DataFrame({'sentence': sent_orig, 'time_sent': sent_time})
# Raw string avoids the invalid-escape warning for '\d'; '=' needs no escaping
df2.insert(1, 'time', df2['time_sent'].str.extract(r'(t=\d+)\s', expand=True))
# Sentences without their own timestamp inherit the next one.
# .bfill() replaces the deprecated fillna(method='bfill').
df2['time'] = df2['time'].bfill()
df2 = df2.drop(columns='time_sent')
df2.head()
# Build the spaCy pipeline
nlp = spacy.load("en_core_web_sm")

@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Force spaCy's sentence starts to follow the '\n' markers inserted
    earlier, so its segmentation matches nltk's exactly."""
    for token in doc[:-1]:
        # A sentence starts right after a newline token, and nowhere else
        doc[token.i + 1].is_sent_start = (token.text == "\n")
    return doc

algo = "textrank"  # alternatives: 'positionrank', 'biasedtextrank'
nlp.add_pipe("set_custom_boundaries", before="parser")
nlp.add_pipe(algo, last=True)
doc = nlp(orig_text_n)

# Sanity check: spaCy must yield exactly one sentence per df2 row
sentences = [s.text for s in doc.sents]
num_sents = len(sentences)
assert num_sents == len(df2)
def count_words(df, column):
    """Return the total whitespace-delimited word count of *column* in *df*.

    The original redundantly re-formatted the column name (`'%s' % column`
    is a no-op for strings) and materialized an intermediate list; a
    generator over the Series is equivalent and lighter.
    """
    return sum(len(text.split()) for text in df[column])
def count_sentences(df):
    """Return the number of sentences, i.e. rows, in the dataframe."""
    return df.shape[0]
# Size of the original transcript
orig_wc = count_words(df2, 'sentence')
print('Transcript word count:', orig_wc)
orig_sc = count_sentences(df2)
print('Transcript sentence count:', orig_sc)

# Rank the sentences for the extractive summary
target_pct = 30           # summary size as % of the original sentence count
phrases_per_sentence = 5  # phrase budget per summary sentence in the ranking
lim_sents = ceil(orig_sc * target_pct / 100)
summ_gen = doc._.textrank.summary(
    limit_phrases=lim_sents * phrases_per_sentence,
    limit_sentences=lim_sents,
    preserve_order=True)
# Collect the summary sentences together with their token offsets
sents, starts, ends = [], [], []
for sent in summ_gen:
    sents.append(sent.text)
    starts.append(sent.start)
    ends.append(sent.end)

# Top 20 ranked words/phrases, comma-separated (join handles the trailing
# separator that the original trimmed with [:-2])
keywords = ', '.join(phrase.text for phrase in doc._.phrases[:20])
# Assemble the summary dataframe and pull the timestamps back in from df2
df3 = pd.DataFrame({'sentence': sents, 'start': starts, 'end': ends})
df3['sentence'] = df3['sentence'].str[:-1]  # strip the trailing '\n'
df3 = df3.merge(df2, on='sentence', how='left')
# Capitalize the first character of every sentence (vectorized)
df3['sentence'] = df3['sentence'].str[:1].str.upper() + df3['sentence'].str[1:]
df3.head()

# Compare summary size with the original transcript
summ_wc = count_words(df3, 'sentence')
summ_sc = count_sentences(df3)
reduction_wc = int(round((1 - summ_wc / orig_wc) * 100, 0))
reduction_sc = int(round((1 - summ_sc / orig_sc) * 100, 0))
print("Summary vs. original word count: ", summ_wc, "/", orig_wc)
print("Percentage reduction of words: ", reduction_wc)
print("\nSummary vs. original sentence count: ", summ_sc, "/", orig_sc)
print("Percentage reduction of sentences: ", reduction_sc)
# Scrape the title of the video from its YouTube page
import requests
from bs4 import BeautifulSoup
url = 'https://youtu.be/' + video_id
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
# Keep the last <title> tag found (matches the original loop's behavior)
for tag in soup.find_all('title'):
    video_title = tag.get_text()
# Generate an abstract from the summary with an LED model fine-tuned on PubMed.
# Join the sentences with a space: the original '+=' concatenation ran them
# together ("...end.Next sentence"), degrading both the model input and the
# summary file saved later.
summ_out = ' '.join(df3['sentence'])
tokenizer = LEDTokenizer.from_pretrained("patrickvonplaten/led-large-16384-pubmed")
input_ids = tokenizer(summ_out, return_tensors="pt").input_ids.to(device)
# LED requires global attention on at least the first token
global_attention_mask = torch.zeros_like(input_ids)
global_attention_mask[:, 0] = 1
model = LEDForConditionalGeneration.from_pretrained(
    "patrickvonplaten/led-large-16384-pubmed",
    return_dict_in_generate=True).to(device)
# Inference only: no_grad avoids building the autograd graph (saves memory).
# generate() already returns tensors on the model's device, so the original
# redundant .to(device) on the result is dropped.
with torch.no_grad():
    sequences = model.generate(
        input_ids, global_attention_mask=global_attention_mask).sequences
# Free GPU memory before decoding
del model
gc.collect()
torch.cuda.empty_cache()
decoded = tokenizer.batch_decode(sequences, skip_special_tokens=True)
# Clean up the decoded output: flatten the batch, drop stray newlines and
# leading/trailing whitespace
decoded = ' '.join(decoded).replace('\n ', '').strip()
# Re-segment with nltk and capitalize the start of every sentence
sent_abs = [s[:1].upper() + s[1:] for s in sent_tokenize(decoded)]
abstract = ' '.join(sent_abs)
# Repair mid-word casing the model produces, e.g. 'bERT' -> 'BERT',
# 'deepMind' -> 'DeepMind'. Matches contain letters only, so a plain
# string replace is equivalent to the regex substitution.
for run in re.findall("([a-z]+[A-Z])", abstract):
    abstract = abstract.replace(run, run[:1].upper() + run[1:])
# Persist the extractive summary so it can be scored against the human
# reference later; the filename records algorithm and size
machine_filename = 'summary_%s_%s_%s.txt' % (algo, summ_sc, summ_wc)
with open(machine_filename, 'w') as out:
    out.write(summ_out)
# Build the summary output with html tags
summ_html = "<p>"
prior_sent_end = 0  # token offset where the previous summary sentence ended
for row in df3.itertuples(index=False):
    # Each sentence links to its anchor in the full-transcript section
    link = '<a href="#' + row.time + '">' + row.sentence + '</a> '
    # Start a new paragraph when the sentence sits more than 100 tokens
    # past the end of the prior one; otherwise keep them grouped
    if row.start > prior_sent_end + 100:
        summ_html += '</p><p>' + link
    else:
        summ_html += link
    prior_sent_end = row.end
# Close the final paragraph (the original left it unbalanced)
summ_html += '</p>'

# Build the full transcript output with html tags: each sentence is an
# anchor target and links out to the video at its approximate time
full_html = ""
for row in df2.itertuples(index=False):
    full_html += ('<p id="%s"><a href="%s" target="_blank">%s</a></p>'
                  % (row.time, (url + '?' + row.time), row.sentence))
# User information for the html file (typo fix: 'words/phases' -> 'words/phrases')
overview = '''
<p>This file presents the results of automatic summarization of an online
lecture video titled: <i><a href='%s' target="_blank">%s</a></i>. The abstract
is generated using <b>LED</b>, or Longformer-Encoder-Decoder, a
state-of-the-art Transformer-based language model. This implementation uses a
pre-trained model, fine-tuned on <b>PubMed</b>, a long-range summarization
dataset. The top-ranked words/phrases and sentences are extracted from the
original transcript of the video to produce a summary using <b>%s</b>, an
unsupervised graph-based algorithm. The sentences for the summary are returned
in the order of original occurrence in the transcript (i.e., not ranked order).
Words and sentences in the summary are reduced by %s%% and %s%%, respectively,
compared with the original transcript. Sentences are grouped into paragraphs
based on their positional locations. Long paragraphs indicate several sentences
in close proximity with minimal pruning between them. Short paragraphs and
orphaned sentences suggest that more context may be needed. The final section is
the full extracted transcript, line by line. Sentences in the 'Summary' section
are hyperlinked to the 'Full Transcript' section. Sentences in the 'Full
Transcript' section are hyperlinked to the video at the approximate time of
utterance.</p> ''' % (url, video_title, algo, reduction_wc, reduction_sc)
# Combine all sections into an HTML file.
# Template slots, in order: overview, abstract, keywords, summary html,
# full-transcript html. The :target rule highlights the anchor a summary
# link jumps to.
html_summary = '''
<html>
<head>
<meta charset="UTF-8">
<style>
body {max-width: 960px; margin: 20px auto;}
a:link, a:visited {text-decoration: none; color: #000000;}
:target {background-color: yellow;}
</style>
</head>
<body>
<h1>Overview</h1>
%s
<h1>Abstract</h1>
<p>%s</p>
<h1>Keywords/phrases</h1>
<p>%s</p>
<h1>Summary</h1>
%s
<h1>Full Transcript</h1>
%s
</body>
</html>
'''
# Write the final report next to the notebook
with open('summary.html', 'w') as f:
    f.write(html_summary % (overview, abstract, keywords, summ_html, full_html))
# Download the reference (human-written) summary for evaluation
human_url = 'https://mkreager.github.io/nlp-summarization/human.txt'
with urllib.request.urlopen(human_url) as resp:
    human_file = resp.read().decode('utf-8')

human_wc = len(human_file.split())
human_sc = len(sent_tokenize(human_file))
# Sanity check: the human summary was written with the same sentence budget
assert human_sc == summ_sc
print('Human summary word count:', human_wc)
print('Human summary sentence:', human_sc)
# Read back the machine summary and score it against the human reference
with open(machine_filename, 'r') as fh:
    machine_file = fh.read()
# ROUGE-1/2/L with stemming; score(target, prediction)
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(human_file, machine_file)
print(scores)
# --- Optional demo (disabled): Biased TextRank --------------------------------
# Re-ranks the transcript with a bias toward the term "reinforcement";
# uncomment to run after the main pipeline above.
#nlp = spacy.load("en_core_web_sm")
#nlp.add_pipe("biasedtextrank", last=True)
#doc = nlp(orig_text_n)
#tr = doc._.textrank
#tr.change_focus(focus="reinforcement", bias=1.0, default_bias=0.0)
#for sent in tr.summary(limit_phrases=50, limit_sentences=3, preserve_order=True):
#    print(sent.start, sent)