-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScript.fsx
276 lines (248 loc) · 8.31 KB
/
Script.fsx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
#r "nuget: Npgsql, 6.0.3"
#r "nuget: Dapper, 2.0.123"
#r "nuget: FSharp.Control.AsyncSeq, 3.2.1"
open System
open System.IO
open System.Threading
open System.Threading.Tasks
open System.Text.RegularExpressions
open System.Collections.Generic
open System.Text.Json
open FSharp.Control
type Word = string
type Repeat = uint32
type NextWords = Map<Word, Repeat>
type Entry =
| Start of NextWords
| Word of NextWords
| End
type Window =
| Window1 of firstComponent:Word
| Window2 of firstComponent:Word * secondComponent:Word
type Entries = Map<Window, Entry>
type Corpus = { CorpusId: int64; Name: string }
[<RequireQualifiedAccess>]
module NextWords =
let mostRepeatableWord (nw: NextWords) : Word option =
let most = nw |> Seq.sortByDescending (fun x -> x.Value) |> Seq.tryHead
match most with
| Some w -> Some w.Key
| None -> None
let randomWord (nw: NextWords) : Word option =
let most = nw |> Seq.map (fun p -> (Guid.NewGuid(), p.Key)) |> Seq.sortBy fst |> Seq.tryHead
match most with
| Some (_, w) -> Some w
| None -> None
let add (w: Word) (nw: NextWords) : NextWords =
let repeat =
match Map.tryFind w nw with
| Some r -> r
| None -> 0u
Map.add w (repeat + 1u) nw
let ofEntry = function
| Start nw -> nw
| Word nw -> nw
| End -> Map.empty
[<RequireQualifiedAccess>]
module Window =
let inline first win =
match win with
| Window1 x -> x
| Window2 (f, _) -> f
let toWord = function
| Window1 x -> x
| Window2 (f, s) -> $"{f} {s}"
let toArray = function
| Window1 x -> [| x |]
| Window2 (f, s) -> [| f; s |]
let ofWord (w: Word) = Window1 w
[<RequireQualifiedAccess>]
module Entry =
let inline create (existing: Entry option) (prev: Entry option) (words: NextWords) : Entry =
match prev with
| Some (End)
-> Start words
| Some (Word _) ->
match existing with
| Some (Start _) ->
Start words
| Some (Word _)
| None ->
Word words
| Some End ->
End
| Some (Start _)
| None
-> Word words
module EntryParser =
[<Literal>]
let endWord = "."
let (|WordToken|_|) str =
if str <> endWord
then Some str
else None
let (|EndToken|_|) str =
if str = endWord
then Some endWord
else None
let private createEntry w nextw prevEntry dict =
let addNextWord =
match nextw with
| Some nw -> NextWords.add nw
| None -> fun x -> x
match Map.tryFind w dict with
| Some e ->
NextWords.ofEntry e
|> addNextWord
|> Entry.create (Some e) prevEntry
| None ->
Map.empty
|> addNextWord
|> Entry.create None prevEntry
let private window1Parser tokens prevEntry dict =
match tokens with
// a b
| (WordToken w)::nextw::tail ->
let win = Window1 w
Some (win, createEntry win (Some nextw) prevEntry dict, nextw::tail)
// . a
| (EndToken w)::nextw::tail ->
Some (Window1 w, End, nextw::tail)
// a $
| (WordToken w)::tail ->
let win = Window1 w
Some (win, createEntry win None prevEntry dict, tail)
// . $
| (EndToken w)::tail ->
Some (Window1 w, End, tail)
| _ ->
None
let private window2Parser tokens prevEntry dict =
match tokens with
// a b c
// a b .
| (WordToken w1)::(WordToken w2)::nextw::tail ->
let w = Window2 (w1, w2)
Some (w, createEntry w (Some nextw) prevEntry dict, w2::nextw::tail)
// a . b
| (WordToken w1)::(EndToken w2)::nextw::tail ->
Some (Window1 w1, End, w2::nextw::tail)
// . b c
| (EndToken w)::w2::nextw::tail ->
Some (Window1 w, End, w2::nextw::tail)
// a b $
// $ - eol
| (WordToken w1)::(WordToken w2)::tail ->
let w = Window2 (w1, w2)
Some (w, createEntry w None prevEntry dict, w2::tail)
// a . $
| (WordToken w1)::(EndToken w2)::tail ->
Some (Window1 w1, End, w2::tail)
// . b $
| (EndToken w)::nextw::tail ->
Some (Window1 w, End, nextw::tail)
// a $
| (WordToken w)::tail ->
let win = Window1 w
Some (win, createEntry win None prevEntry dict, tail)
// . $
| (EndToken w)::tail ->
Some (Window1 w, End, tail)
| _ ->
None
let private parseEntries windowParser (seed: Map<Window, Entry>) (tokens: string list): Map<Window, Entry> =
let rec parse tokens prevEntry dict =
match windowParser tokens prevEntry dict with
| None ->
dict
| Some (w, entry, tokens) ->
dict
|> Map.add w entry
|> parse tokens (Some entry)
parse tokens None seed
let parseEntries1 = parseEntries window1Parser
let parseEntries2 = parseEntries window2Parser
let parseTokens strs =
strs
|> Seq.filter (String.IsNullOrWhiteSpace >> not)
|> Seq.map(fun x -> x.ToLowerInvariant())
|> Seq.toList
open EntryParser
let getEntriesFromTxt () =
use f = File.OpenText("./corpus.txt")
let mutable tokens = Map.empty
while f.EndOfStream |> not do
tokens <-
Regex.Split(f.ReadLine(), "(\s|\.)", RegexOptions.CultureInvariant ||| RegexOptions.IgnoreCase)
|> parseTokens
|> parseEntries1 tokens
tokens
let getEntriesFromJson () =
use f = File.OpenText("./result.json")
use doc = JsonDocument.Parse(f.ReadToEnd())
let msgs = doc.RootElement.GetProperty("messages").EnumerateArray()
let mutable tokens = []
for message in msgs do
let m = message.GetProperty("text")
if m.ValueKind = JsonValueKind.String then
tokens <-
tokens @
(Regex.Split($"{endWord}{m.GetString()}{endWord}", "(\s|\.)", RegexOptions.CultureInvariant ||| RegexOptions.IgnoreCase)
|> parseTokens)
tokens
let dict = getEntriesFromJson () |> parseEntries2 Map.empty
dict
|> Map.iter (fun k v -> printfn "%A: %A" k v)
type Phrase = Phrase of Word list
[<RequireQualifiedAccess>]
module Phrase =
let empty = Phrase []
let add (w: Word) (Phrase ws) : Phrase = Phrase (ws @ [w])
let addFromWindow (win: Window) (p: Phrase) : Phrase = add (win |> Window.toWord) p
let toString (Phrase ws) = String.Join (" ", ws)
let genPhrase (dict: Map<Window, Entry>) : Phrase =
let random = Random()
let tryRandItem (keys: 'a[]) : 'a option =
if Array.length keys > 0
then Some (keys.[random.Next(0, keys.Length)])
else None
let inline findPairByWindow (win: Window) (dict: Map<Window, Entry>) =
let randomPair =
dict
|> Seq.filter (fun e -> e.Key = win || ((Window.first e.Key) = (Window.first win)))
|> Seq.toArray
|> tryRandItem
match randomPair with
| Some pair -> Some (pair.Key, pair.Value)
| None -> None
let rec gen (result: Phrase) (win: Window): Phrase =
match Window.toWord win with
| EndToken _ ->
result
| _ ->
match findPairByWindow win dict with
| Some (key, entry) ->
let randWord =
entry
|> NextWords.ofEntry
|> NextWords.randomWord
let updatedPhrase = Phrase.addFromWindow key result
match randWord with
| None -> updatedPhrase
| Some token -> gen updatedPhrase (Window.ofWord token)
| None ->
Phrase.addFromWindow win result
let randStartKey =
dict
|> Seq.choose (fun p ->
match p.Value with
| Entry.Start _ -> Some p.Key
| _ -> None)
|> Seq.toArray
|> tryRandItem
match randStartKey with
| None -> Phrase.empty
| Some key -> gen Phrase.empty key
dict
|> genPhrase
|> Phrase.toString