Horizon – Blame information for rev 15

Subversion Repositories:
Rev:
Rev Author Line No. Line
11 office 1 using Horizon.Database;
15 office 2 using Horizon.Searching;
3 using Horizon.Snapshots;
11 office 4 using Serilog;
5 using System;
6 using System.Collections.Generic;
7 using System.Data.SQLite;
13 office 8 using System.Diagnostics;
11 office 9 using System.Drawing;
10 using System.IO;
11 using System.Linq;
13 office 12 using System.Runtime.CompilerServices;
11 office 13 using System.Text;
13 office 14 using System.Text.RegularExpressions;
11 office 15 using System.Threading;
16 using System.Threading.Tasks;
13 office 17 using System.Threading.Tasks.Dataflow;
18 using Tesseract;
11 office 19 using TrackedFolders;
20  
21 namespace Horizon
22 {
/// <summary>
/// Static helpers that snapshot files into the snapshot database and search index, and that
/// extract words from screen captures using Tesseract OCR.
/// </summary>
public static class Extensions
{
    /// <summary>
    /// Guards <see cref="_tesseractEngine"/>: tesseract can only process a single image at once.
    /// </summary>
    private static readonly SemaphoreSlim _tesseractSemaphoreSlim = new SemaphoreSlim(1, 1);

    /// <summary>
    /// Measures the OCR text extraction. Only touched while <see cref="_tesseractSemaphoreSlim"/> is held.
    /// </summary>
    private static readonly Stopwatch _tesseractStopWatch = new Stopwatch();

    /// <summary>
    /// Shared OCR engine, loaded with the "eng" language data from the "tessdata" directory
    /// located under the application's base directory.
    /// </summary>
    private static readonly TesseractEngine _tesseractEngine = new TesseractEngine(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "tessdata"), "eng", EngineMode.Default);

    /// <summary>
    /// Splits text into words.
    /// https://stackoverflow.com/questions/7311734/split-sentence-into-words-but-having-trouble-with-the-punctuations-in-c-sharp
    /// </summary>
    private static readonly Regex _splitWordRegex = new Regex(@"((\b[^\s]+\b)((?<=\.\w).)?)", RegexOptions.Compiled);

    /// <summary>
    /// Takes a snapshot of every file directly inside <paramref name="path"/> (sub-directories are
    /// not visited) and indexes each created snapshot.
    /// </summary>
    /// <param name="path">Directory whose files are snapshotted.</param>
    /// <param name="trackedFolders">Used to look up the color associated with a file's directory.</param>
    /// <param name="snapshotDatabase">Database in which the snapshots are created.</param>
    /// <param name="searchEngine">Search engine that indexes every created snapshot.</param>
    /// <param name="cancellationToken">Token that stops the operation; cancellation ends the run without throwing.</param>
    public static async Task TakeSnapshot(string path, TrackedFolders.TrackedFolders trackedFolders, SnapshotDatabase snapshotDatabase, SearchEngine searchEngine, CancellationToken cancellationToken)
    {
        await TakeSnapshots(path, SearchOption.TopDirectoryOnly, trackedFolders, snapshotDatabase, searchEngine, cancellationToken);
    }

    /// <summary>
    /// Takes a snapshot of every file inside <paramref name="path"/> and all of its sub-directories
    /// and indexes each created snapshot.
    /// </summary>
    /// <param name="path">Root directory whose files are snapshotted.</param>
    /// <param name="trackedFolders">Used to look up the color associated with a file's directory.</param>
    /// <param name="snapshotDatabase">Database in which the snapshots are created.</param>
    /// <param name="searchEngine">Search engine that indexes every created snapshot.</param>
    /// <param name="cancellationToken">Token that stops the operation; cancellation ends the run without throwing.</param>
    public static async Task TakeSnapshotRecursive(string path, TrackedFolders.TrackedFolders trackedFolders, SnapshotDatabase snapshotDatabase, SearchEngine searchEngine, CancellationToken cancellationToken)
    {
        await TakeSnapshots(path, SearchOption.AllDirectories, trackedFolders, snapshotDatabase, searchEngine, cancellationToken);
    }

    /// <summary>
    /// Recognizes the text in <paramref name="screenCapture"/> with Tesseract and streams the
    /// individual words found in it.
    /// </summary>
    /// <param name="screenCapture">Image to run OCR on.</param>
    /// <param name="cancellationToken">Token that stops the operation; cancellation ends the sequence without throwing.</param>
    /// <returns>
    /// The recognized words in the order they appear in the text. The sequence is empty when OCR
    /// fails (the failure is logged) or when the operation is cancelled.
    /// </returns>
    public static async IAsyncEnumerable<string> RecognizeStrings(Bitmap screenCapture, [EnumeratorCancellation] CancellationToken cancellationToken)
    {
        // The OCR work lives in a helper because C# does not allow 'yield return' inside a
        // try block that has a catch clause.
        var words = await RecognizeWords(screenCapture, cancellationToken);

        foreach (var word in words)
        {
            if (cancellationToken.IsCancellationRequested)
            {
                yield break;
            }

            yield return word;
        }
    }

    /// <summary>
    /// Shared implementation of <see cref="TakeSnapshot"/> and <see cref="TakeSnapshotRecursive"/>:
    /// snapshots and indexes every file enumerated under <paramref name="path"/>.
    /// </summary>
    private static async Task TakeSnapshots(string path, SearchOption searchOption, TrackedFolders.TrackedFolders trackedFolders, SnapshotDatabase snapshotDatabase, SearchEngine searchEngine, CancellationToken cancellationToken)
    {
        foreach (var file in Directory.EnumerateFiles(path, "*.*", searchOption))
        {
            if (cancellationToken.IsCancellationRequested)
            {
                return;
            }

            var fileName = Path.GetFileName(file);

            // The directory must be derived from the full path: the bare file name has no
            // directory component, so looking it up would never match a tracked folder.
            var directory = Path.GetDirectoryName(file);
            var color = Color.Empty;
            if (trackedFolders.TryGet(directory, out var folder))
            {
                color = folder.Color;
            }

            try
            {
                if (await snapshotDatabase.CreateSnapshotAsync(fileName, file, color, cancellationToken) is Snapshot snapshot)
                {
                    await searchEngine.Index(snapshot, cancellationToken);
                }
            }
            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
            {
                // Cancellation is not a failure of this file; stop instead of logging an error
                // for every remaining file.
                return;
            }
            catch (SQLiteException exception) when (exception.ResultCode == SQLiteErrorCode.Constraint)
            {
                Log.Information(exception, "Snapshot already exists.");
            }
            catch (Exception exception)
            {
                // Also reached by non-constraint SQLite errors so that they are not lost silently.
                // ':l' renders the path without quotes.
                Log.Error(exception, "Could not take snapshot of file: {File:l}", file);
            }
        }
    }

    /// <summary>
    /// Runs OCR on <paramref name="screenCapture"/> and splits the recognized text into words.
    /// Returns an empty list when OCR fails or the wait for the engine is cancelled.
    /// </summary>
    private static async Task<IReadOnlyList<string>> RecognizeWords(Bitmap screenCapture, CancellationToken cancellationToken)
    {
        // tesseract can only process a single image at once
        try
        {
            await _tesseractSemaphoreSlim.WaitAsync(cancellationToken);
        }
        catch (OperationCanceledException)
        {
            // Cancelled while waiting: the semaphore was not acquired, so it must not be released.
            return Array.Empty<string>();
        }

        try
        {
            // ocr image
            using var memoryStream = new MemoryStream();
            screenCapture.Save(memoryStream, System.Drawing.Imaging.ImageFormat.Bmp);
            using var pix = Pix.LoadFromMemory(memoryStream.ToArray());
            using var page = _tesseractEngine.Process(pix);

            // Restart (not Start) so the logged duration covers this call only rather than
            // accumulating over every call made so far.
            _tesseractStopWatch.Restart();
            var text = page.GetText();
            _tesseractStopWatch.Stop();

            Log.Information("Tesseract OCR complete in {Elapsed}", _tesseractStopWatch.Elapsed);

            var words = new List<string>();
            foreach (Match match in _splitWordRegex.Matches(text))
            {
                words.Add(match.Value);
            }

            return words;
        }
        catch (Exception exception)
        {
            Log.Error(exception, "Exception thrown while processing images with OCR");

            return Array.Empty<string>();
        }
        finally
        {
            _tesseractSemaphoreSlim.Release();
        }
    }
}
153 }