Horizon – Diff between revs 11 and 13
?pathlinks?
Rev 11 | Rev 13 | |||
---|---|---|---|---|
Line 1... | Line 1... | |||
1 | using Horizon.Database; |
1 | using Horizon.Database; |
|
2 | using Serilog; |
2 | using Serilog; |
|
3 | using System; |
3 | using System; |
|
4 | using System.Collections.Generic; |
4 | using System.Collections.Generic; |
|
5 | using System.Data.SQLite; |
5 | using System.Data.SQLite; |
|
- | 6 | using System.Diagnostics; |
||
6 | using System.Drawing; |
7 | using System.Drawing; |
|
7 | using System.IO; |
8 | using System.IO; |
|
8 | using System.Linq; |
9 | using System.Linq; |
|
- | 10 | using System.Runtime.CompilerServices; |
||
9 | using System.Text; |
11 | using System.Text; |
|
- | 12 | using System.Text.RegularExpressions; |
||
10 | using System.Threading; |
13 | using System.Threading; |
|
11 | using System.Threading.Tasks; |
14 | using System.Threading.Tasks; |
|
- | 15 | using System.Threading.Tasks.Dataflow; |
||
- | 16 | using Tesseract; |
||
12 | using TrackedFolders; |
17 | using TrackedFolders; |
|
Line 13... | Line 18... | |||
13 | |
18 | |
|
14 | namespace Horizon |
19 | namespace Horizon |
|
15 | { |
20 | { |
|
16 | public static class Extensions |
21 | public static class Extensions |
|
- | 22 | { |
||
- | 23 | private static readonly SemaphoreSlim _tesseractSemaphoreSlim = new SemaphoreSlim(1, 1); |
||
- | 24 | |
||
- | 25 | private static readonly Stopwatch _tesseractStopWatch = new Stopwatch(); |
||
- | 26 | |
||
- | 27 | private static readonly TesseractEngine _tesseractEngine = new TesseractEngine(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "tessdata"), "eng", EngineMode.Default); |
||
- | 28 | |
||
- | 29 | /// <summary> |
||
- | 30 | /// https://stackoverflow.com/questions/7311734/split-sentence-into-words-but-having-trouble-with-the-punctuations-in-c-sharp |
||
- | 31 | /// </summary> |
||
- | 32 | private static readonly Regex _splitWordRegex = new Regex(@"((\b[^\s]+\b)((?<=\.\w).)?)", RegexOptions.Compiled); |
||
17 | { |
33 | |
|
18 | public static async Task TakeSnapshot(string path, TrackedFolders.TrackedFolders trackedFolders, SnapshotDatabase snapshotDatabase, CancellationToken cancellationToken) |
34 | public static async Task TakeSnapshot(string path, TrackedFolders.TrackedFolders trackedFolders, SnapshotDatabase snapshotDatabase, CancellationToken cancellationToken) |
|
19 | { |
35 | { |
|
20 | foreach (var file in Directory.EnumerateFiles(path, "*.*", SearchOption.TopDirectoryOnly)) |
36 | foreach (var file in Directory.EnumerateFiles(path, "*.*", SearchOption.TopDirectoryOnly)) |
|
21 | { |
37 | { |
|
Line 72... | Line 88... | |||
72 | { |
88 | { |
|
73 | Log.Error(exception, $"Could not take snapshot of file: {file}"); |
89 | Log.Error(exception, $"Could not take snapshot of file: {file}"); |
|
74 | } |
90 | } |
|
75 | } |
91 | } |
|
76 | } |
92 | } |
|
- | 93 | |
||
- | 94 | public static async IAsyncEnumerable<string> RecognizeStrings(Bitmap screenCapture, [EnumeratorCancellation] CancellationToken cancellationToken) |
||
- | 95 | { |
||
- | 96 | var bufferBlock = new BufferBlock<string>(new DataflowBlockOptions { CancellationToken = cancellationToken, EnsureOrdered = false }); |
||
- | 97 | // tesseract can only process a single image at once |
||
- | 98 | await _tesseractSemaphoreSlim.WaitAsync(); |
||
- | 99 | try |
||
- | 100 | { |
||
- | 101 | // ocr image |
||
- | 102 | using var memoryStream = new MemoryStream(); |
||
- | 103 | screenCapture.Save(memoryStream, System.Drawing.Imaging.ImageFormat.Bmp); |
||
- | 104 | memoryStream.Position = 0L; |
||
- | 105 | var imageData = memoryStream.ToArray(); |
||
- | 106 | using var pix = Pix.LoadFromMemory(imageData); |
||
- | 107 | using var page = _tesseractEngine.Process(pix); |
||
- | 108 | _tesseractStopWatch.Start(); |
||
- | 109 | var text = page.GetText(); |
||
- | 110 | _tesseractStopWatch.Stop(); |
||
- | 111 | var time = _tesseractStopWatch.Elapsed; |
||
- | 112 | |
||
- | 113 | Log.Information($"Tesseract OCR complete in {time}"); |
||
- | 114 | |
||
- | 115 | foreach (var match in _splitWordRegex.Matches(text)) |
||
- | 116 | { |
||
- | 117 | await bufferBlock.SendAsync($"{match}", cancellationToken); |
||
- | 118 | } |
||
- | 119 | |
||
- | 120 | bufferBlock.Complete(); |
||
- | 121 | } |
||
- | 122 | catch (Exception exception) |
||
- | 123 | { |
||
- | 124 | Log.Error(exception, $"Exception thrown while processing images with OCR"); |
||
- | 125 | |
||
- | 126 | bufferBlock.Complete(); |
||
- | 127 | } |
||
- | 128 | finally |
||
- | 129 | { |
||
- | 130 | _tesseractSemaphoreSlim.Release(); |
||
- | 131 | } |
||
- | 132 | |
||
- | 133 | //await bufferBlock.Completion; |
||
- | 134 | while (await bufferBlock.OutputAvailableAsync()) |
||
- | 135 | { |
||
- | 136 | if (bufferBlock.TryReceive(out var term)) |
||
- | 137 | { |
||
- | 138 | yield return term; |
||
- | 139 | } |
||
- | 140 | } |
||
- | 141 | } |
||
77 | } |
142 | } |
|
78 | } |
143 | } |