-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHocrHeaderFixer.cs
28 lines (26 loc) · 1 KB
/
HocrHeaderFixer.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.Linq;
/// <summary>
/// hOCR processor that rewrites the document header: it sets the
/// <c>&lt;title&gt;</c> to reference the page image and appends an
/// <c>ocr-system</c> <c>&lt;meta&gt;</c> element, then saves the document
/// back over the original file.
/// </summary>
class HocrHeaderFixer : IHocrXmlProcessor
{
    // Trailing part of the hOCR file name that is swapped for the image suffix.
    // The dot is escaped so that it only matches a literal '.'.
    private const string HocrSuffixPattern = @"_HOCR\.shtml$";
    private const string ImageSuffix = "_JP2.jpg";

    private static readonly XNamespace Xhtml = "http://www.w3.org/1999/xhtml";

    /// <summary>
    /// Prints a progress message announcing this processing step.
    /// </summary>
    public void Init()
    {
        Console.WriteLine("Fixing hOCR file headers...");
    }

    /// <summary>
    /// Updates the header of <paramref name="hocrXml"/> and writes the document
    /// to <paramref name="hocrFile"/>, replacing the file's previous content.
    /// </summary>
    /// <param name="hocrFile">
    /// Path of the hOCR file. Its file name is used to derive the title, and the
    /// file itself is truncated and rewritten (so it must already exist).
    /// </param>
    /// <param name="hocrXml">The parsed hOCR document; it is modified in place.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="hocrFile"/> or <paramref name="hocrXml"/> is null.
    /// </exception>
    /// <exception cref="InvalidDataException">
    /// The document lacks the XHTML <c>html</c>, <c>head</c> or <c>title</c> element.
    /// </exception>
    public void Process(string hocrFile, XDocument hocrXml)
    {
        ArgumentNullException.ThrowIfNull(hocrFile);
        ArgumentNullException.ThrowIfNull(hocrXml);

        var html = hocrXml.Element(Xhtml + "html")
            ?? throw new InvalidDataException($"'{hocrFile}' has no XHTML <html> root element.");
        var head = html.Element(Xhtml + "head")
            ?? throw new InvalidDataException($"'{hocrFile}' has no <head> element.");
        var title = head.Element(Xhtml + "title")
            ?? throw new InvalidDataException($"'{hocrFile}' has no <title> element.");

        title.Value = "Image: " + Regex.Replace(Path.GetFileName(hocrFile), HocrSuffixPattern, ImageSuffix);
        head.Add(new XElement(Xhtml + "meta",
            new XAttribute("name", "ocr-system"),
            new XAttribute("content", "Transkribus")));

        var settings = new XmlWriterSettings
        {
            // need to specify false here to stop it from emitting a byte order mark
            Encoding = new UTF8Encoding(false),
            Indent = true
        };

        // XmlWriter does not close the underlying stream by default
        // (XmlWriterSettings.CloseOutput is false), so the stream is disposed
        // explicitly. Using declarations dispose in reverse order: the writer
        // is flushed first, then the file handle is released, even if Save throws.
        using var stream = File.Open(hocrFile, FileMode.Truncate);
        using var writer = XmlWriter.Create(stream, settings);
        hocrXml.Save(writer);
    }
}