Skip to content

Commit 9901a28

Browse files
committed
Fix: Use utf-16 encoding in ContentReader::value_at
1 parent 199d840 commit 9901a28

File tree

3 files changed

+33
-4
lines changed

3 files changed

+33
-4
lines changed

vhdl_lang/src/data/contents.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
use super::latin_1::{char_to_latin1, Latin1String, Utf8ToLatin1Error};
88
use super::source::{Position, Range};
9+
use itertools::Itertools;
910
use std::fs::File;
1011
use std::io;
1112
use std::io::prelude::Read;
@@ -353,10 +354,9 @@ impl<'a> ContentReader<'a> {
353354

354355
pub fn value_at(&self, line: usize, start: usize, stop: usize) -> Option<Latin1String> {
355356
let line = self.contents.get_line(line)?;
356-
if stop > line.len() {
357-
return None;
358-
}
359-
Latin1String::from_utf8(&line[start..stop]).ok()
357+
let utf16_view: Box<[u16]> = line.encode_utf16().get(start..stop).collect();
358+
let str_view = String::from_utf16(&utf16_view).ok()?;
359+
Latin1String::from_utf8(&str_view).ok()
360360
}
361361
}
362362

vhdl_lang/src/data/source.rs

+1
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ pub struct Position {
185185
/// Line (zero-based).
186186
pub line: u32,
187187
/// Column (zero-based).
188+
/// The character offset is counted in UTF-16 code units.
188189
pub character: u32,
189190
}
190191

vhdl_lang/src/syntax/tokens/tokenizer.rs

+28
Original file line numberDiff line numberDiff line change
@@ -2651,6 +2651,34 @@ my_other_ident",
26512651
);
26522652
}
26532653

2654+
#[test]
2655+
fn non_ascii_before_bit_string() {
2656+
let code = Code::new("€X\"ABC\"");
2657+
let (tokens, _) = code.tokenize_result();
2658+
assert_eq!(
2659+
tokens,
2660+
vec![
2661+
Err(Diagnostic::syntax_error(
2662+
code.s1("€"),
2663+
"Found invalid latin-1 character '€'",
2664+
)),
2665+
Ok(Token {
2666+
kind: BitString,
2667+
value: Value::BitString(
2668+
Latin1String::from_utf8_unchecked("X\"ABC\""),
2669+
ast::BitString {
2670+
length: None,
2671+
base: BaseSpecifier::X,
2672+
value: Latin1String::from_utf8_unchecked("ABC"),
2673+
}
2674+
),
2675+
pos: code.s1("X\"ABC\"").pos(),
2676+
comments: None,
2677+
})
2678+
],
2679+
);
2680+
}
2681+
26542682
#[test]
26552683
fn tokenize_based_integer() {
26562684
assert_eq!(

0 commit comments

Comments
 (0)