Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@
| [news_nlp](/projects/news_nlp) | Строкова Анастасия Владиславовна |
| [newsgroups-classification](/projects/newsgroups-classification) | Герасимчук Михаил Юрьевич |
| [fake-news-classifier](/projects/fake-news-classifier) | Артемьев Алексей Дмитриевич |
| [news-classifier](projects/news-classifier) | Солодкая Мария Александровна |
1 change: 1 addition & 0 deletions projects/news-classifier/lab1_tokenizer.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"executionInfo":{"elapsed":2538,"status":"ok","timestamp":1705424302454,"user":{"displayName":"Maria Solodkaya","userId":"01009481737218333062"},"user_tz":-180},"id":"1wjQhvdsb0dE"},"outputs":[],"source":["import re # работа с регулярными выражениями\n","import os\n","import nltk # для обработки естественного языка\n","import pandas as pd\n","from pathlib import Path\n","from nltk import SnowballStemmer, WordNetLemmatizer # для стемминга и лемматизации\n","from nltk.corpus import wordnet"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":11638,"status":"ok","timestamp":1705424314089,"user":{"displayName":"Maria Solodkaya","userId":"01009481737218333062"},"user_tz":-180},"id":"PzovMpj5d17E","outputId":"8e68f6b9-5435-4fc0-ed5e-f9554248d8cb"},"outputs":[],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":370,"status":"ok","timestamp":1705424314456,"user":{"displayName":"Maria Solodkaya","userId":"01009481737218333062"},"user_tz":-180},"id":"dubD_L4Zb4g7","outputId":"d1d2d7a5-4e8b-44a3-afdd-90f05a3cbade"},"outputs":[],"source":["nltk.download('averaged_perceptron_tagger')\n","nltk.download('wordnet')\n","\n","# стемминг - обрезания слова до его основы\n","# SnowballStemmer полезен при анализе текстов\n","# для уменьшения количества уникальных слов и\n","# упрощения анализа. Он работает на основе\n","# алгоритма Портера и может обрабатывать слова\n","# на разных языках.\n","stemmer = SnowballStemmer(\"english\")\n","\n","# лемматизация - приведения слова к его базовой форме\n","# WordNetLemmatizer может уменьшить количество уникальных\n","# слов в тексте и упростить анализ. Он использует базу\n","# данных WordNet для определения базовой формы\n","# слова в зависимости от его части речи.\n","lemmatizer = WordNetLemmatizer()"]},{"cell_type":"code","execution_count":null,"metadata":{"executionInfo":{"elapsed":3,"status":"ok","timestamp":1705424314456,"user":{"displayName":"Maria Solodkaya","userId":"01009481737218333062"},"user_tz":-180},"id":"9xrnTqhAb6x-"},"outputs":[],"source":["# знаки препинания, обозначающие конец предложения\n","end_of_clause = ['.', '?', '!', '...']\n","\n","# принимает слово и возвращает его часть речи\n","def get_wordnet_pos(word):\n"," # pos_tag принимает список слов и возвращает список кортежей,\n"," # каждый из которых содержит слово и его POS-тег\n"," tag = nltk.pos_tag([word])[0][1][0].upper()\n"," # сопоставление POS-тега с тегом WordNet\n"," tag_dict = {\"J\": wordnet.ADJ,\n"," \"N\": wordnet.NOUN,\n"," \"V\": wordnet.VERB,\n"," \"R\": wordnet.ADV}\n"," # если тег не найден, то возвращает существительное\n"," return tag_dict.get(tag, wordnet.NOUN)"]},{"cell_type":"code","execution_count":null,"metadata":{"executionInfo":{"elapsed":3,"status":"ok","timestamp":1705424314456,"user":{"displayName":"Maria Solodkaya","userId":"01009481737218333062"},"user_tz":-180},"id":"Z2EaaZ9tb-Hc"},"outputs":[],"source":["# словарь knownAbbrevs, содержащий сокращения и их полные формы\n","knownAbbrevs = {\n"," \"col.\": \"college\",\n"," \"messrs.\": \"messieurs\",\n"," \"gov.\": \"government\",\n"," \"adm.\": \"admiral\",\n"," \"rev.\": \"revolution\",\n"," \"fr.\": \"french\",\n"," \"maj.\": \"major\",\n"," \"sgt.\": \"sergeant\",\n"," \"cpl.\": \"corporal\",\n"," \"pvt.\": \"private\",\n"," \"capt.\": \"captain\",\n"," \"ave.\": \"avenue\",\n"," \"pres.\": \"president\",\n"," \"brig.\": \"brigadier\",\n"," \"cmdr.\": \"commander\",\n"," \"asst.\": \"assistant\",\n"," \"assoc.\": \"associate\",\n"," \"insp.\": \"inspiration\",\n"," \"st.\": \"saint\",\n"," \"dr.\": \"doctor\",\n"," \"tel.\": \"telephone\",\n"," \"no.\": \"number\",\n"," \"u.s.\": \"United States\",\n"," \"u.k.\": \"United Kingdom\",\n"," \"prof.\": \"professor\",\n"," \"inc.\": \"incorporated\",\n"," \"ltd.\": \"limited\",\n"," \"corp.\": \"corporation\",\n"," \"co.\": \"corporation\",\n"," \"mr.\": \"mister\",\n"," \"plc.\": \"Public Limited Company\",\n"," \"assn.\": \"association\",\n"," \"univ.\": \"university\",\n"," \"intl.\": \"international\",\n"," \"sys.\": \"system\",\n"," \"est.\": \"Eastern Standard Time\",\n"," \"ext.\": \"extention\",\n"," \"sq.\": \"square\",\n"," \"jr.\": \"junior\",\n"," \"sr.\": \"senior\",\n"," \"bros.\": \"brothers\",\n"," \"ed.d.\": \"Doctor of Education\",\n"," \"ph.d.\": \"Doctor of Phylosophy\",\n"," \"sci.\": \"Science\",\n"," \"etc.\": \"Et Cetera\",\n"," \"al.\": \"al\",\n"," \"seq.\": \"sequence\",\n"," \"orig.\": \"original\",\n"," \"incl.\": \"include\",\n"," \"eg.\": \"eg\",\n"," \"avg.\": \"average\",\n"," \"pl.\": \"place\",\n"," \"min.\": \"min\",\n"," \"max.\": \"max\",\n"," \"cit.\": \"citizen\",\n"," \"mrs.\": \"mrs\",\n"," \"mx.\": \"mx\",\n"," \"miss.\": \"miss\",\n"," \"atty.\": \"attorney\"\n","}"]},{"cell_type":"code","execution_count":null,"metadata":{"executionInfo":{"elapsed":2,"status":"ok","timestamp":1705424314456,"user":{"displayName":"Maria Solodkaya","userId":"01009481737218333062"},"user_tz":-180},"id":"1KPYqHIccAjB"},"outputs":[],"source":["# регулярные выражения\n","tokens = [\n"," # аббревиатуры\n"," [\"abbrev\", \"|\".join(map(lambda kv: \"(?i:\" + re.escape(kv[0]) + \")\", knownAbbrevs.items()))],\n"," # IP-адреса\n"," [\"ipaddress\", \"[0-9]+\\\\.[0-9]+\\\\.[0-9]+\\\\.[0-9]+\"],\n"," # числительные (5th)\n"," [\"numeral\", \"[0-9]+((th)|(\\\\'s))\"],\n"," # цифры с дефисом и буквами (123A, 456-B, 789c)\n"," [\"metrics\", \"[0-9]+(-)?[A-Z]?[a-z]+\"],\n"," # последовательность с $ ($2,525)\n"," [\"currency\", \"\\\\$[0-9]+(\\\\,[0-9]+)*\"],\n"," # числа (в том числе дробные)\n"," [\"number\", \"[0-9]+(.|,|-)[0-9]*\"],\n"," # пробелы, новая строка, слэш, табуляция\n"," [\"whitespace\", \"\\\\s|\\\\n|\\\\\\\\|\\\\t\"],\n"," # скобки\n"," [\"braces\", \"\\\\(|\\\\)\"],\n"," # слова в кавычках\n"," [\"quoted\", \"(\\\\\\\")[^\\\\\\\"]*(\\\\\\\")\"],\n"," # пунктуация (, . ? ! ...)\n"," [\"punct\", \",|\\\\.|\\\\?|\\\\!|(\\\\.\\\\.\\\\.)\"],\n"," # обычные слова, с апострофом (won't), с дефисом (twenty-five)\n"," [\"word\", \"[A-Za-z][A-Za-z\\\\']*(-[A-Z\\\\']?[A-Za-z\\\\']+)*\"],\n"," # одиночные символы, символы, которые не являются буквенно-цифровыми\n"," [\"other\", \".[^a-zA-Z0-9]*\"],\n"," # телефонные номера, например: +7-901-000-00-00, 8(918)3213412 и т.д.\n"," [\"mobile_phone\", \"^(?:\\+7|8)(?:(?:-\\d{3}-|\\(\\d{3}\\))\\d{3}-\\d{2}-\\d{2}|\\d{10})\"],\n"," # электронная почта, например: abc@abc.abc\n"," [\"mail\", \"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}\"],\n","]"]},{"cell_type":"code","execution_count":null,"metadata":{"executionInfo":{"elapsed":313,"status":"ok","timestamp":1705424314767,"user":{"displayName":"Maria Solodkaya","userId":"01009481737218333062"},"user_tz":-180},"id":"15X-sFAZcDCY"},"outputs":[],"source":["# компиляция регулярного выражения, которое будет использоваться\n","# для идентификации токенов на основе заданных выражений\n","regex = re.compile(\"^(\" + \"|\".join(map(lambda t: \"(?P<\" + t[0] + \">\" + t[1] + \")\", tokens)) + \")\")\n","\n","# словарь с классами новостей\n","classes = dict([\n"," (1, 'World'),\n"," (2, 'Sports'),\n"," (3, 'Business'),\n"," (4, 'Sci-Tech')\n","])"]},{"cell_type":"code","execution_count":null,"metadata":{"executionInfo":{"elapsed":6,"status":"ok","timestamp":1705424314768,"user":{"displayName":"Maria Solodkaya","userId":"01009481737218333062"},"user_tz":-180},"id":"UTCcLnUKcGbK"},"outputs":[],"source":["# токенизация\n","def tokenize_text(text):\n"," # позиция в тексте\n"," pos = 0\n"," s = text\n"," # найденные токены\n"," line = []\n"," while len(s) > 0:\n"," # поиск первого совпадения с регулярным выражением\n"," match = regex.search(s)\n"," # проверка, что совпадение найдено и конечная позиция совпадения больше начальной позиции\n"," if match and match.endpos > match.pos:\n"," for gr in tokens:\n"," # только те значения из группы захвата, которые не являются None\n"," tt = list(filter(lambda kv: kv[1] is not None, match.groupdict().items()))\n"," # проверка, что список tt содержит ровно один элемент\n"," if len(tt) == 1:\n"," # значение ключа токена\n"," kind = tt[0][0]\n"," # значение значения токена\n"," part = tt[0][1]\n"," # случай работы с аббревиатурами\n"," if kind == 'abbrev':\n"," kind = 'word'\n"," part = knownAbbrevs[part.lower()]\n"," # новый элемент в список, состоящий из: позиции в тексте, типа токена и самого токена\n"," line.append([pos, kind, part])\n"," # увеличение значения переменной pos на длину токена\n"," pos += len(tt[0][1])\n"," # урезание с конца на длину токена\n"," s = s[len(tt[0][1]):]\n"," break\n"," else:\n"," print('tokenization is not possible: ' + s)\n"," else:\n"," print('tokenization is not possible: ' + s)\n"," return line"]},{"cell_type":"code","execution_count":null,"metadata":{"executionInfo":{"elapsed":5,"status":"ok","timestamp":1705424314768,"user":{"displayName":"Maria Solodkaya","userId":"01009481737218333062"},"user_tz":-180},"id":"agiUqsLpcI9A"},"outputs":[],"source":["def operation_with_file(fname):\n"," print('working on ', fname)\n"," df = pd.read_csv(fname, sep=',', header=None)\n"," data = df.values\n"," data_count = len(data)\n"," # номер обрабатываемого файла\n"," n = 0\n"," for row in data:\n"," class_id = row[0]\n"," try:\n"," dir_path = \"/content/drive/MyDrive/ITMO/sem_3/NLP/assets/annotated-corpus/\" + Path(fname).name.split('.')[0] + \"/\" + classes[class_id] + '/'\n"," if not os.path.exists(dir_path):\n"," os.makedirs(dir_path)\n"," f = open(dir_path + str(n) + '.tsv', 'w+')\n"," f.truncate(0)\n"," # перебор столбцов, кроме первого, в текущей строке данных\n"," for i in range(1, len(row)):\n"," #конкретная ячейка\n"," text = row[i]\n"," tokens = tokenize_text(text)\n"," prev = [0, '', '']\n"," # перебор токенов\n"," for w in tokens:\n"," # проверка, что тип токена не пробел\n"," if w[1] != 'whitespace':\n"," # запись строки со значениями типа токена, самого токена, стеммы токена, леммы токена\n"," f.write(w[1] + '\\t' + w[2] + '\\t' + stemmer.stem(w[2]) + \"\\t\" + lemmatizer.lemmatize(w[2], get_wordnet_pos(w[2])) + '\\n')\n"," # проверка, что последний токен (prev[2]) находится в списке end_of_clause (конец предложения)\n"," elif prev[2] in end_of_clause:\n"," f.write('\\n')\n"," # текущий токен\n"," prev = w\n"," f.write('\\n')\n"," f.close()\n"," except Exception as e:\n"," # печать ошибки\n"," print(e)\n"," # вывод информации о номере, тексте и токенах, связанных с возникшей ошибкой\n"," print([n, text, tokens])\n"," pass\n"," n = n + 1\n"," # значение n кратно 1000\n"," if n % 1000 == 0:\n"," # печать прогресса обработки файла в %\n"," print(int(n * 100 / data_count), '%')"]},{"cell_type":"code","execution_count":null,"metadata":{"executionInfo":{"elapsed":5,"status":"ok","timestamp":1705424314768,"user":{"displayName":"Maria Solodkaya","userId":"01009481737218333062"},"user_tz":-180},"id":"CVQgoFq1fCFs"},"outputs":[],"source":["path_train = os.path.join(os.getcwd(), 'drive', 'MyDrive', 'ITMO', 'sem_3', 'NLP', 'assets', 'dataset', 'train.csv')\n","path_test = os.path.join(os.getcwd(), 'drive', 'MyDrive', 'ITMO', 'sem_3', 'NLP', 'assets', 'dataset', 'test.csv')"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":182781,"status":"ok","timestamp":1705429667439,"user":{"displayName":"Maria Solodkaya","userId":"01009481737218333062"},"user_tz":-180},"id":"d6Wxp0GRcLpf","outputId":"73c896dd-0317-4fc7-e6d6-47044b82ba33"},"outputs":[],"source":["operation_with_file(path_train)\n","# operation_with_file(path_test)"]},{"cell_type":"code","execution_count":null,"metadata":{"executionInfo":{"elapsed":6,"status":"aborted","timestamp":1705424445426,"user":{"displayName":"Maria Solodkaya","userId":"01009481737218333062"},"user_tz":-180},"id":"PhsTY87QGWlT"},"outputs":[],"source":["# from google.colab import drive\n","# drive.mount('/content/drive')"]},{"cell_type":"code","execution_count":null,"metadata":{"executionInfo":{"elapsed":6,"status":"aborted","timestamp":1705424445426,"user":{"displayName":"Maria Solodkaya","userId":"01009481737218333062"},"user_tz":-180},"id":"2cgYEfZCFR8D"},"outputs":[],"source":["# path_train = os.path.join(os.getcwd(), 'drive', 'MyDrive', 'ITMO', 'sem_3', 'NLP', 'assets', 'annotated-corpus', 'train')\n","# path_test = os.path.join(os.getcwd(), 'drive', 'MyDrive', 'ITMO', 'sem_3', 'NLP', 'assets', 'annotated-corpus', 'test')\n","\n","# !zip -r /content/drive/MyDrive/ITMO/sem_3/NLP/assets/annotated-corpus/test /content/drive/MyDrive/ITMO/sem_3/NLP/assets/annotated-corpus/test\n","# !zip -r /content/drive/MyDrive/ITMO/sem_3/NLP/assets/annotated-corpus/train /content/drive/MyDrive/ITMO/sem_3/NLP/assets/annotated-corpus/train\n","\n","\n","# from google.colab import files\n","# files.download(\"/content/drive/MyDrive/ITMO/sem_3/NLP/assets/annotated-corpus/train.zip\")\n","# files.download(\"/content/drive/MyDrive/ITMO/sem_3/NLP/assets/annotated-corpus/test.zip\")\n"]}],"metadata":{"colab":{"authorship_tag":"ABX9TyMc76h0wYt1RLowgJsOnUef","mount_file_id":"1PqAp27TjD1x_Q47GnPwE4DQfW0peNsxp","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}
Loading