IntelliScript/Summarizer.py at main · Khushi-Chaudhary04/IntelliScript · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import nltk
import os

# Keep NLTK resources inside the IntelliScript folder on the user's desktop.
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop", "IntelliScript")

# Create the nltk_data directory (and any missing parents). exist_ok=True
# replaces the separate exists() check and is safe if the folder is already there.
nltk_data_path = os.path.join(desktop_path, "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)

# Let NLTK search the project-local directory when it loads resources.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Download the sentence-tokenizer data INTO the directory created above.
# Without download_dir, NLTK stores the files in its own default location and
# the folder prepared here stays empty.
nltk.download('punkt', download_dir=nltk_data_path)
# Recent NLTK releases load the 'punkt_tab' resource in sent_tokenize() instead
# of the pickled 'punkt' models, so fetch it as well to work across versions.
nltk.download('punkt_tab', download_dir=nltk_data_path)


from transformers import pipeline
from nltk.tokenize import sent_tokenize
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from transformers import pipeline, AutoTokenizer
import unicodedata
import logging

# Only let ERROR-and-above records from the "transformers" logger through.
logging.getLogger("transformers").setLevel(logging.ERROR)

def get_summary(manual_subtitles, text, model_choice):
    """Summarize *text* with the abstractive model selected by *model_choice*.

    When *manual_subtitles* is truthy, the text is first condensed by
    ``get_extractive_summary`` and that result is passed to the abstractive
    model; otherwise the raw text goes to the abstractive model directly.

    Returns whatever ``get_abstractive_summary`` returns for the chosen input.
    """
    source_text = get_extractive_summary(text) if manual_subtitles else text
    return get_abstractive_summary(source_text, model_choice)

def get_abstractive_summary(extractive_summary, model_choice):
    """Generate an abstractive summary of *extractive_summary*.

    Args:
        extractive_summary: Text to summarize.
        model_choice: Key into the model table: ``0`` selects
            "facebook/bart-large-cnn", ``1`` selects "t5-base".

    Returns:
        The ``summary_text`` field of the first pipeline result, or ``""``
        when the input is an empty or whitespace-only string.

    Raises:
        KeyError: If *model_choice* is not a key of the model table.
    """
    models = {
        0: "facebook/bart-large-cnn",
        1: "t5-base"
    }
    model_name = models[model_choice]

    # Nothing to summarize: skip the expensive model load instead of asking
    # the model to generate text from blank input.
    if isinstance(extractive_summary, str) and not extractive_summary.strip():
        return ""

    # Cap the tokenized input at 1024 tokens (adjust as needed).
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=1024)

    # Hand the configured tokenizer to the pipeline; otherwise the pipeline
    # loads its own tokenizer and the max-length setting above has no effect.
    generator = pipeline('summarization', model=model_name, tokenizer=tokenizer)

    # truncation=True cuts over-long inputs down to the tokenizer's max length
    # rather than feeding the model more tokens than it was configured for.
    results = generator(
        extractive_summary,
        max_length=100,
        min_length=5,
        do_sample=True,
        early_stopping=True,
        truncation=True,
    )

    return results[0]['summary_text']

def get_extractive_summary(text):
    """Return a TextRank extractive summary of *text* as a single string.

    The number of sentences requested from the summarizer is 70% of the
    sentence count reported by ``sent_tokenize``, rounded with ``round()``.
    The selected sentences are joined with single spaces.
    """
    parsed = PlaintextParser.from_string(text, Tokenizer("english"))
    keep_count = round(len(sent_tokenize(text)) * 0.70)
    rank_sentences = TextRankSummarizer()
    chosen = rank_sentences(parsed.document, keep_count)

    return " ".join(map(str, chosen))

def clean_summary(text):
    """Strip transcript artifacts from *text* and return it as plain ASCII.

    Each sentence found by ``sent_tokenize`` has the markers "[music]",
    "[Music]", newlines, "<<" and ">>" removed, is stripped of surrounding
    whitespace and gets its first character upper-cased. Sentences that end
    up empty are dropped. The sentences are joined with single spaces,
    NFKD-normalized, and any character that cannot be encoded as ASCII is
    discarded.

    If *text* equals an entry of ``error_dict`` it is returned unchanged.
    """
    # Placeholder list (currently holds only ``Ellipsis``); presumably meant to
    # list known error messages that must pass through untouched -- TODO confirm.
    error_dict = [...]
    if any(error == text for error in error_dict):
        return text

    irrelevant_terms = ["[music]", "[Music]", "\n", "<<", ">>"]
    cleaned_sentences = []
    for sentence in sent_tokenize(text):
        # Remove EVERY marker from the same sentence. The previous nested
        # comprehension produced one copy of the sentence per marker, each
        # with only that single marker removed, so the output repeated every
        # sentence five times.
        for term in irrelevant_terms:
            sentence = sentence.replace(term, "")
        sentence = sentence.strip()
        if not sentence:
            continue
        # Upper-case only the first character: str.capitalize() would also
        # lower-case the rest and mangle proper nouns and acronyms.
        cleaned_sentences.append(sentence[:1].upper() + sentence[1:])

    cleaned_text = " ".join(cleaned_sentences)
    normalized_text = unicodedata.normalize('NFKD', cleaned_text)
    formatted_text = normalized_text.encode('ascii', 'ignore').decode('ascii')
    # NOTE(review): "\'" is the same string as "'", so this replace changes
    # nothing; if backslash-escaped apostrophes were meant, it needs "\\'".
    return formatted_text.replace("\'", "'")

# Example usage: run only when this file is executed as a script, so that
# importing the module does not load a model and print a summary.
if __name__ == "__main__":
    print(get_summary(True, "Nvidia is a leading technology company in the field of graphics processing units (GPUs) and artificial intelligence (AI). Founded in 1993, Nvidia has played a significant role in shaping the modern computing landscape.", 1))