# CSS for line wrapping within the notebook
from IPython.display import HTML, display
def set_css():
    """Inject CSS so long <pre> output wraps instead of scrolling sideways."""
    display(HTML('<style> pre { white-space: pre-wrap; } </style>'))
# Re-register on every cell run: notebook display state is per-cell
get_ipython().events.register('pre_run_cell', set_css)
%%capture
# Install NLP / summarization dependencies quietly (%%capture hides pip output)
!pip install -q nltk
!pip install -q spacy
!pip install -q rouge-score
!pip install -q transformers
!pip install -q -U pytextrank
!pip install -q youtube_transcript_api
# Small English model for the spaCy pipeline loaded below
!python -m spacy download en_core_web_sm
import re
import gc
import nltk
import torch
import spacy
import pytextrank  # registers the textrank/positionrank/biasedtextrank pipes with spaCy
import pandas as pd
from math import ceil
import urllib.request
nltk.download('punkt')  # sentence-tokenizer models required by sent_tokenize
from spacy.language import Language
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize
from youtube_transcript_api import YouTubeTranscriptApi as youtube
from transformers import LEDForConditionalGeneration, LEDTokenizer
device = "cuda" if torch.cuda.is_available() else "cpu" # use GPU if available
# DeepMind x UCL: The Deep Learning Lecture Series 2020
# See: https://deepmind.com/learning-resources/deep-learning-lecture-series-2020
video_id = '8zAP2qWAsKg'

# Select the manually-created English transcript for this video and fetch it
available = youtube.list_transcripts(video_id)
transcript = available.find_transcript(['en-GB'])
segments = transcript.fetch()  # phrase-level entries: text / start / duration

# Load the phrases into a dataframe and attach a time flag to each one so
# sentence-level timestamps can be recovered after re-segmentation
df1 = pd.DataFrame(segments)
start_secs = df1['start'].fillna(0).astype(int) - 1  # clip decimals, back up 1s
df1['start'] = start_secs
df1['time'] = 't=' + start_secs.astype(str)          # time with 't' prefix
df1['time_text'] = df1['time'] + ' ' + df1['text']   # "t=N phrase text"
df1 = df1.drop(columns=['start', 'duration'])
df1.head()
# Join all of the text strings and strip control whitespace.
# NOTE(review): the final .replace(' ', '') removes EVERY space, which would
# destroy word boundaries downstream; it was presumably meant to collapse
# doubled spaces ('  ') introduced by the join -- confirm against the
# original notebook before relying on word counts.
orig_text = ' '.join(df1['text']).replace('\n','').replace('\r','') \
    .replace('\t','').replace(' ','')
time_text = ' '.join(df1['time_text']).replace('\n','').replace('\r','') \
    .replace('\t','').replace(' ','')
# Segment sentences with nltk
sent_orig = sent_tokenize(orig_text)
sent_time = sent_tokenize(time_text)
# Append '\n' to each sentence so spaCy can be forced to follow nltk's
# segmentation later (see the set_custom_boundaries component)
sent_orig_n = [s + '\n' for s in sent_orig]
# Re-join for the nlp pipeline (map(str, ...) was a no-op on strings)
orig_text_n = ''.join(sent_orig_n)
# Extract the first timestamp in each time-flagged sentence and associate it
# with the corresponding plain sentence
df2 = pd.DataFrame({'sentence': sent_orig, 'time_sent': sent_time})
# Raw string avoids the invalid-escape warning for '\d'; '=' needs no escaping
df2.insert(1, 'time', df2['time_sent'].str.extract(r'(t=\d+)\s', expand=True))
# Sentences without their own timestamp inherit the next one.
# .bfill() replaces the deprecated fillna(method='bfill').
df2['time'] = df2['time'].bfill()
df2 = df2.drop(columns='time_sent')
df2.head()
# Build the spaCy pipeline
nlp = spacy.load("en_core_web_sm")

@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Force spaCy's sentence starts to follow the '\n' markers inserted
    earlier, so its segmentation matches nltk's exactly."""
    for token in doc[:-1]:
        # A sentence starts right after a newline token, and nowhere else
        doc[token.i + 1].is_sent_start = (token.text == "\n")
    return doc

algo = "textrank"  # alternatives: 'positionrank', 'biasedtextrank'
nlp.add_pipe("set_custom_boundaries", before="parser")
nlp.add_pipe(algo, last=True)
doc = nlp(orig_text_n)

# Sanity check: spaCy must yield exactly one sentence per df2 row
sentences = [s.text for s in doc.sents]
num_sents = len(sentences)
assert num_sents == len(df2)
def count_words(df, column):
    """Return the total whitespace-delimited word count of *column* in *df*.

    The original redundantly re-formatted the column name (`'%s' % column`
    is a no-op for strings) and materialized an intermediate list; a
    generator over the Series is equivalent and lighter.
    """
    return sum(len(text.split()) for text in df[column])
def count_sentences(df):
    """Return the number of sentences, i.e. rows, in the dataframe."""
    return df.shape[0]
# Size of the original transcript
orig_wc = count_words(df2, 'sentence')
print('Transcript word count:', orig_wc)
orig_sc = count_sentences(df2)
print('Transcript sentence count:', orig_sc)

# Rank the sentences for the extractive summary
target_pct = 30           # summary size as % of the original sentence count
phrases_per_sentence = 5  # phrase budget per summary sentence in the ranking
lim_sents = ceil(orig_sc * target_pct / 100)
summ_gen = doc._.textrank.summary(
    limit_phrases=lim_sents * phrases_per_sentence,
    limit_sentences=lim_sents,
    preserve_order=True)
# Collect the summary sentences together with their token offsets
sents, starts, ends = [], [], []
for sent in summ_gen:
    sents.append(sent.text)
    starts.append(sent.start)
    ends.append(sent.end)

# Top 20 ranked words/phrases, comma-separated (join handles the trailing
# separator that the original trimmed with [:-2])
keywords = ', '.join(phrase.text for phrase in doc._.phrases[:20])
# Assemble the summary dataframe and pull the timestamps back in from df2
df3 = pd.DataFrame({'sentence': sents, 'start': starts, 'end': ends})
df3['sentence'] = df3['sentence'].str[:-1]  # strip the trailing '\n'
df3 = df3.merge(df2, on='sentence', how='left')
# Capitalize the first character of every sentence (vectorized)
df3['sentence'] = df3['sentence'].str[:1].str.upper() + df3['sentence'].str[1:]
df3.head()

# Compare summary size with the original transcript
summ_wc = count_words(df3, 'sentence')
summ_sc = count_sentences(df3)
reduction_wc = int(round((1 - summ_wc / orig_wc) * 100, 0))
reduction_sc = int(round((1 - summ_sc / orig_sc) * 100, 0))
print("Summary vs. original word count: ", summ_wc, "/", orig_wc)
print("Percentage reduction of words: ", reduction_wc)
print("\nSummary vs. original sentence count: ", summ_sc, "/", orig_sc)
print("Percentage reduction of sentences: ", reduction_sc)
# Scrape the title of the video from its YouTube page
import requests
from bs4 import BeautifulSoup
url = 'https://youtu.be/' + video_id
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
# Keep the last <title> tag found (matches the original loop's behavior)
for tag in soup.find_all('title'):
    video_title = tag.get_text()
# Generate an abstract from the summary with an LED model fine-tuned on PubMed.
# Join the sentences with a space: the original '+=' concatenation ran them
# together ("...end.Next sentence"), degrading both the model input and the
# summary file saved later.
summ_out = ' '.join(df3['sentence'])
tokenizer = LEDTokenizer.from_pretrained("patrickvonplaten/led-large-16384-pubmed")
input_ids = tokenizer(summ_out, return_tensors="pt").input_ids.to(device)
# LED requires global attention on at least the first token
global_attention_mask = torch.zeros_like(input_ids)
global_attention_mask[:, 0] = 1
model = LEDForConditionalGeneration.from_pretrained(
    "patrickvonplaten/led-large-16384-pubmed",
    return_dict_in_generate=True).to(device)
# Inference only: no_grad avoids building the autograd graph (saves memory).
# generate() already returns tensors on the model's device, so the original
# redundant .to(device) on the result is dropped.
with torch.no_grad():
    sequences = model.generate(
        input_ids, global_attention_mask=global_attention_mask).sequences
# Free GPU memory before decoding
del model
gc.collect()
torch.cuda.empty_cache()
decoded = tokenizer.batch_decode(sequences, skip_special_tokens=True)
# Clean up the decoded output: flatten the batch, drop stray newlines and
# leading/trailing whitespace
decoded = ' '.join(decoded).replace('\n ', '').strip()
# Re-segment with nltk and capitalize the start of every sentence
sent_abs = [s[:1].upper() + s[1:] for s in sent_tokenize(decoded)]
abstract = ' '.join(sent_abs)
# Repair mid-word casing the model produces, e.g. 'bERT' -> 'BERT',
# 'deepMind' -> 'DeepMind'. Matches contain letters only, so a plain
# string replace is equivalent to the regex substitution.
for run in re.findall("([a-z]+[A-Z])", abstract):
    abstract = abstract.replace(run, run[:1].upper() + run[1:])
# Persist the extractive summary so it can be scored against the human
# reference later; the filename records algorithm and size
machine_filename = 'summary_%s_%s_%s.txt' % (algo, summ_sc, summ_wc)
with open(machine_filename, 'w') as out:
    out.write(summ_out)
# Build the summary output with html tags
summ_html = "<p>"
prior_sent_end = 0  # token offset where the previous summary sentence ended
for row in df3.itertuples(index=False):
    # Each sentence links to its anchor in the full-transcript section
    link = '<a href="#' + row.time + '">' + row.sentence + '</a> '
    # Start a new paragraph when the sentence sits more than 100 tokens
    # past the end of the prior one; otherwise keep them grouped
    if row.start > prior_sent_end + 100:
        summ_html += '</p><p>' + link
    else:
        summ_html += link
    prior_sent_end = row.end
# Close the final paragraph (the original left it unbalanced)
summ_html += '</p>'

# Build the full transcript output with html tags: each sentence is an
# anchor target and links out to the video at its approximate time
full_html = ""
for row in df2.itertuples(index=False):
    full_html += ('<p id="%s"><a href="%s" target="_blank">%s</a></p>'
                  % (row.time, (url + '?' + row.time), row.sentence))
# User information for the html file (typo fix: 'words/phases' -> 'words/phrases')
overview = '''
<p>This file presents the results of automatic summarization of an online
lecture video titled: <i><a href='%s' target="_blank">%s</a></i>. The abstract
is generated using <b>LED</b>, or Longformer-Encoder-Decoder, a
state-of-the-art Transformer-based language model. This implementation uses a
pre-trained model, fine-tuned on <b>PubMed</b>, a long-range summarization
dataset. The top-ranked words/phrases and sentences are extracted from the
original transcript of the video to produce a summary using <b>%s</b>, an
unsupervised graph-based algorithm. The sentences for the summary are returned
in the order of original occurrence in the transcript (i.e., not ranked order).
Words and sentences in the summary are reduced by %s%% and %s%%, respectively,
compared with the original transcript. Sentences are grouped into paragraphs
based on their positional locations. Long paragraphs indicate several sentences
in close proximity with minimal pruning between them. Short paragraphs and
orphaned sentences suggest that more context may be needed. The final section is
the full extracted transcript, line by line. Sentences in the 'Summary' section
are hyperlinked to the 'Full Transcript' section. Sentences in the 'Full
Transcript' section are hyperlinked to the video at the approximate time of
utterance.</p> ''' % (url, video_title, algo, reduction_wc, reduction_sc)
# Combine all sections into an HTML file.
# Template slots, in order: overview, abstract, keywords, summary html,
# full-transcript html. The :target rule highlights the anchor a summary
# link jumps to.
html_summary = '''
<html>
<head>
<meta charset="UTF-8">
<style>
body {max-width: 960px; margin: 20px auto;}
a:link, a:visited {text-decoration: none; color: #000000;}
:target {background-color: yellow;}
</style>
</head>
<body>
<h1>Overview</h1>
%s
<h1>Abstract</h1>
<p>%s</p>
<h1>Keywords/phrases</h1>
<p>%s</p>
<h1>Summary</h1>
%s
<h1>Full Transcript</h1>
%s
</body>
</html>
'''
# Write the final report next to the notebook
with open('summary.html', 'w') as f:
    f.write(html_summary % (overview, abstract, keywords, summ_html, full_html))
# Download the reference (human-written) summary for evaluation
human_url = 'https://mkreager.github.io/nlp-summarization/human.txt'
with urllib.request.urlopen(human_url) as resp:
    human_file = resp.read().decode('utf-8')

human_wc = len(human_file.split())
human_sc = len(sent_tokenize(human_file))
# Sanity check: the human summary was written with the same sentence budget
assert human_sc == summ_sc
print('Human summary word count:', human_wc)
print('Human summary sentence:', human_sc)
# Read back the machine summary and score it against the human reference
with open(machine_filename, 'r') as fh:
    machine_file = fh.read()
# ROUGE-1/2/L with stemming; score(target, prediction)
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(human_file, machine_file)
print(scores)
# --- Optional demo (disabled): Biased TextRank --------------------------------
# Re-ranks the transcript with a bias toward the term "reinforcement";
# uncomment to run after the main pipeline above.
#nlp = spacy.load("en_core_web_sm")
#nlp.add_pipe("biasedtextrank", last=True)
#doc = nlp(orig_text_n)
#tr = doc._.textrank
#tr.change_focus(focus="reinforcement", bias=1.0, default_bias=0.0)
#for sent in tr.summary(limit_phrases=50, limit_sentences=3, preserve_order=True):
#    print(sent.start, sent)