// Horizon – Rev 13
using Horizon.Database;
using Serilog;
using System;
using System.Collections.Generic;
using System.Data.SQLite;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;
using Tesseract;
using TrackedFolders;
namespace Horizon
{
public static class Extensions
{
private static readonly SemaphoreSlim _tesseractSemaphoreSlim = new SemaphoreSlim(1, 1);
private static readonly Stopwatch _tesseractStopWatch = new Stopwatch();
private static readonly TesseractEngine _tesseractEngine = new TesseractEngine(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "tessdata"), "eng", EngineMode.Default);
/// <summary>
/// https://stackoverflow.com/questions/7311734/split-sentence-into-words-but-having-trouble-with-the-punctuations-in-c-sharp
/// </summary>
private static readonly Regex _splitWordRegex = new Regex(@"((\b[^\s]+\b)((?<=\.\w).)?)", RegexOptions.Compiled);
public static async Task TakeSnapshot(string path, TrackedFolders.TrackedFolders trackedFolders, SnapshotDatabase snapshotDatabase, CancellationToken cancellationToken)
{
foreach (var file in Directory.EnumerateFiles(path, "*.*", SearchOption.TopDirectoryOnly))
{
try
{
var fileName = Path.GetFileName(file);
var directory = Path.GetDirectoryName(fileName);
var color = Color.Empty;
if (trackedFolders.TryGet(directory, out var folder))
{
color = folder.Color;
}
await snapshotDatabase.CreateSnapshotAsync(fileName, file, color, cancellationToken);
}
catch (SQLiteException exception)
{
if (exception.ResultCode == SQLiteErrorCode.Constraint)
{
Log.Information(exception, "Snapshot already exists.");
}
}
catch (Exception exception)
{
Log.Error(exception, $"Could not take snapshot of file: {file}");
}
}
}
public static async Task TakeSnapshotRecursive(string path, TrackedFolders.TrackedFolders trackedFolders, SnapshotDatabase snapshotDatabase, CancellationToken cancellationToken)
{
foreach (var file in Directory.EnumerateFiles(path, "*.*", SearchOption.AllDirectories))
{
try
{
var fileName = Path.GetFileName(file);
var directory = Path.GetDirectoryName(fileName);
var color = Color.Empty;
if (trackedFolders.TryGet(directory, out var folder))
{
color = folder.Color;
}
await snapshotDatabase.CreateSnapshotAsync(fileName, file, color, cancellationToken);
}
catch (SQLiteException exception)
{
if (exception.ResultCode == SQLiteErrorCode.Constraint)
{
Log.Information(exception, "Snapshot already exists.");
}
}
catch (Exception exception)
{
Log.Error(exception, $"Could not take snapshot of file: {file}");
}
}
}
public static async IAsyncEnumerable<string> RecognizeStrings(Bitmap screenCapture, [EnumeratorCancellation] CancellationToken cancellationToken)
{
var bufferBlock = new BufferBlock<string>(new DataflowBlockOptions { CancellationToken = cancellationToken, EnsureOrdered = false });
// tesseract can only process a single image at once
await _tesseractSemaphoreSlim.WaitAsync();
try
{
// ocr image
using var memoryStream = new MemoryStream();
screenCapture.Save(memoryStream, System.Drawing.Imaging.ImageFormat.Bmp);
memoryStream.Position = 0L;
var imageData = memoryStream.ToArray();
using var pix = Pix.LoadFromMemory(imageData);
using var page = _tesseractEngine.Process(pix);
_tesseractStopWatch.Start();
var text = page.GetText();
_tesseractStopWatch.Stop();
var time = _tesseractStopWatch.Elapsed;
Log.Information($"Tesseract OCR complete in {time}");
foreach (var match in _splitWordRegex.Matches(text))
{
await bufferBlock.SendAsync($"{match}", cancellationToken);
}
bufferBlock.Complete();
}
catch (Exception exception)
{
Log.Error(exception, $"Exception thrown while processing images with OCR");
bufferBlock.Complete();
}
finally
{
_tesseractSemaphoreSlim.Release();
}
//await bufferBlock.Completion;
while (await bufferBlock.OutputAvailableAsync())
{
if (bufferBlock.TryReceive(out var term))
{
yield return term;
}
}
}
}
}