HOPE ART is an educational platform for ages 2-5, developed by Humanitarian Operations (HOPE). It uses AR technology to deliver immersive experiences and lessons, from core nursery topics to other subjects like music.
I’ve been part of the Android app’s development, where I got to work on different game prototypes while implementing optimisations to ensure smooth performance at nursery testing sessions.
Currently, I am working on the PC game, which uses advanced computer vision algorithms such as hand tracking, gesture tracking and human segmentation to make lessons more immersive.
I achieved this by building a two-way TCP connection between the Python and Unity programs. Initially, the process went like this:
Video capture was done in Unity and sent to Python; Python performed image processing on each frame, then the results (track data) were sent back and simulated in Unity.
However, encoding the video capture in Unity caused significant lag, since new pixels had to be written to a Texture2D every frame.
I fixed this by performing the video capture in Python instead. This not only improved performance but also simplified the communication, making it one-way.
In addition, it eliminated the need to transfer the image bytes twice. This boosted performance from 30 FPS to 200+ FPS!
The following Python code handles sending the track data and camera feed over TCP. The camFeedSender instance is used by HumanSegmentation.py to send the camera feed, while trackDataSender is used by HandGestureTracker and HandProcessor to send results to Unity for simulation.
import socket
import struct

# Communication Vars
IP = "127.0.0.1"  # Local
CAM_FEED_PORT, TRACK_DATA_PORT = 8000, 8001

# Class for establishing a TCP socket connection
class TCPSocket:
    def __init__(self, IP, PORT):
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.sock.connect((IP, PORT))

    def Send(self, data):
        self.sock.sendall(data)  # Send all bytes over the socket

# Class to send image buffer over TCP
class CamFeedSender(TCPSocket):
    def SendImgBuffer(self, buffer):
        # Pack the frame size and send it in big-endian format
        self.Send(struct.pack("!L", len(buffer)))
        self.Send(buffer.tobytes())  # Send the compressed frame data

# Class to send string data over TCP
class TrackDataSender(TCPSocket):
    def SendStr(self, str):
        self.Send(str.encode())

camFeedSender = CamFeedSender(IP, CAM_FEED_PORT)
trackDataSender = TrackDataSender(IP, TRACK_DATA_PORT)
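For context, the Python-side capture loop that feeds CamFeedSender can be pictured roughly like this. It is a minimal sketch assuming OpenCV and JPEG compression; the actual HumanSegmentation.py pipeline also runs segmentation on each frame before sending it.

import cv2  # Assumed capture/encoding library

cap = cv2.VideoCapture(0)  # Default webcam
while True:
    ok, frame = cap.read()
    if not ok:
        continue
    # Compress the frame (here as JPEG) so the TCP payload stays small
    success, buffer = cv2.imencode(".jpg", frame)
    if success:
        camFeedSender.SendImgBuffer(buffer)  # Sends the size header, then the frame bytes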
The following code receives the track data and camera feed over TCP in Unity. TcpSocket and TcpReceiver have been separated to allow for a future TcpSender implementation for sending requests to Python.
using UnityEngine;
using System.Net;
using System.Net.Sockets;
using System.Threading;

public abstract class TcpSocket : MonoBehaviour
{
    const string IP = "127.0.0.1"; // Local host
    protected abstract int PORT { get; }

    Thread streamThread;
    TcpListener listener;
    protected TcpClient client;
    protected NetworkStream stream;

    void Start()
    {
        // Create receiving thread to prevent blocking
        StartThread(ref streamThread, new(StreamThreadStart));

        void StartThread(ref Thread t, ThreadStart ts)
        {
            t = new Thread(ts);
            t.IsBackground = true;
            t.Start();
        }
    }

    protected virtual void StreamThreadStart()
    {
        ListenForConnection();
    }

    protected virtual void ListenForConnection()
    {
        // Listen for connection
        listener = new TcpListener(IPAddress.Parse(IP), PORT);
        listener.Start();
        // Connect to Python program
        client = listener.AcceptTcpClient();
        stream = client.GetStream();
    }

    // Properly end communication
    protected virtual void EndComms()
    {
        streamThread?.Abort();
        listener?.Stop();
        client?.Close();
        stream?.Close();
    }

    private void OnDestroy() => EndComms();
    private void OnApplicationQuit() => EndComms();
}
using System;
using System.IO;
using UnityEngine;

public abstract class TcpReceiver : TcpSocket
{
    protected BinaryReader binaryReader;
    public bool active = true;

    protected abstract void ReadData();

    protected override void ListenForConnection()
    {
        base.ListenForConnection();
        binaryReader = new(stream);
    }

    protected override void StreamThreadStart()
    {
        base.StreamThreadStart();
        // Keep receiving data until told to stop
        while (active)
        {
            // Handle exception errors
            try
            {
                ReadData();
            }
            catch (Exception err)
            {
                Debug.LogWarning(err.ToString());
            }
        }
    }

    // Properly end communication
    protected override void EndComms()
    {
        base.EndComms();
        binaryReader?.Close();
    }
}
using PimDeWitte.UnityMainThreadDispatcher;
using System;
using System.IO;

public class CamFeedReceiver : TcpReceiver
{
    protected override int PORT => 8000;

    public event Action<byte[]> OnReceived;

    public static CamFeedReceiver Instance { get; private set; }

    private void Awake()
    {
        Instance = this;
    }

    protected override void ReadData()
    {
        // First receive image size then data
        int frameSize = (int)ReadUInt32BigEndian(binaryReader);
        byte[] frameBytes = binaryReader.ReadBytes(frameSize);

        // Call on main thread
        UnityMainThreadDispatcher.Instance().Enqueue(
            () => OnReceived?.Invoke(frameBytes)); // CamDisplay.cs is listening for frames to display

        uint ReadUInt32BigEndian(BinaryReader reader)
        {
            byte[] bytes = reader.ReadBytes(4);
            Array.Reverse(bytes);
            return BitConverter.ToUInt32(bytes, 0);
        }
    }
}
using PimDeWitte.UnityMainThreadDispatcher;
using System;
using System.Text;

public class TrackDataReceiver : TcpReceiver
{
    protected override int PORT => 8001;

    public event Action<string> OnReceived;

    public static TrackDataReceiver Instance { get; private set; }

    private void Awake()
    {
        Instance = this;
    }

    protected override void ReadData()
    {
        byte[] bytes = new byte[client.ReceiveBufferSize];
        int bytesRead = stream.Read(bytes, 0, bytes.Length); // Get bytes length to read whole data
        string data = Encoding.UTF8.GetString(bytes, 0, bytesRead); // Convert bytes to string

        // Call on main thread
        UnityMainThreadDispatcher.Instance().Enqueue(
            () => OnReceived?.Invoke(data)); // TrackDataReader.cs is listening for track data to deserialise
    }
}
Currently, I am adding back the two-way connection to allow Unity to request Python to perform specific types of tracking. For example, there are times when we want only human segmentation and hand tracking.
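As a rough sketch of how the Python side could handle these requests (hypothetical; the request format, port and active_trackers flags are placeholder names rather than the final design), a small receiver could reuse the TCPSocket class shown earlier:

import threading

# Flags the tracking loops would check to decide what to run
active_trackers = {"hand": True, "gesture": True, "segmentation": True}

# Hypothetical request channel; Unity would listen on this port and send request strings
REQUEST_PORT = 8002
requestSocket = TCPSocket(IP, REQUEST_PORT)

def listen_for_requests():
    while True:
        request = requestSocket.sock.recv(1024).decode()  # e.g. "gesture:off"
        if not request:
            break
        tracker, state = request.split(":")
        active_trackers[tracker] = (state == "on")

threading.Thread(target=listen_for_requests, daemon=True).start()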
In order to simulate the received track data in Unity, we need to be able to interpret it. By pairing track data with a tag, we know what type of data we are receiving. The format of the sent data is:
Format:
(TAG, [[TRACK_DATA1], [TRACK_DATA2]])
Example:
(2, [['Open'], ['Close']])
In this example, the data contains the gestures of two different hands and is categorised under tag number 2, which is converted to an enum in Unity to group datasets.
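For illustration, the Python-side trackers could build this string with a small helper before handing it to trackDataSender (format_track_data is a hypothetical helper name, not one from the project):

def format_track_data(tag, datasets):
    # Build the "(TAG, [[...], [...]])" string that Unity expects
    inner = ", ".join(
        "[" + ", ".join(f"'{item}'" for item in dataset) + "]"
        for dataset in datasets)
    return f"({tag}, [{inner}])"

# Example: gestures for two hands, grouped under tag 2
trackDataSender.SendStr(format_track_data(2, [["Open"], ["Close"]]))
# -> "(2, [['Open'], ['Close']])"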
The following code is used for reading received track data and sorting them into a dictionary by tag.
using System;
using System.Collections.Generic;
using UnityEngine;

public class TrackDataReader : MonoBehaviour
{
    public enum Tag { HandLm, HandHandedness, HandGesture }

    public Dictionary<Tag, Action<List<string>>> Datasets { get; private set; }

    public static TrackDataReader Instance { get; private set; }

    private void Awake()
    {
        Instance = this;
        Datasets = new();
        foreach (Tag tag in Enum.GetValues(typeof(Tag)))
        {
            Datasets.Add(tag, default);
        }
    }

    private void Start()
    {
        TrackDataReceiver.Instance.OnReceived += ReadDatasets;
    }

    int startI, endI;
    Tag tag;
    HashSet<Tag> seenTags = new();
    List<string> dataset = new();
    int dataCount;
    int datasetI;

    void ReadDatasets(string str)
    {
        startI = 0;
        endI = 0;
        seenTags.Clear();
        // For each dataset
        while ((startI = str.IndexOf('(', endI)) != -1)
        {
            // Get tag
            startI++;
            endI = str.IndexOf(',', startI) - 1;
            tag = (Tag)int.Parse(SubstringSelect()); // Tag no

            // Only check each tag once
            if (seenTags.Contains(tag))
                break;
            else
                seenTags.Add(tag);

            // Get dataset
            startI = endI + 3 + 1;
            dataset.Clear();
            dataCount = 1;
            datasetI = startI;
            do
            {
                int nextOpen = str.IndexOf('[', datasetI),
                    nextClose = str.IndexOf(']', datasetI);
                if (nextClose < nextOpen || nextOpen == -1)
                {
                    datasetI = nextClose;
                    dataCount--;
                }
                else if (nextOpen < nextClose)
                {
                    datasetI = nextOpen;
                    dataCount++;
                }
                if (dataCount == 1)
                {
                    endI = datasetI;
                    dataset.Add(SubstringSelect());
                    startI = str.IndexOf('[', datasetI);
                }
                datasetI++;
            }
            while (dataCount != 0);

            // Broadcast dataset
            Datasets[tag]?.Invoke(dataset);

            string SubstringSelect() => str.Substring(startI, endI - startI + 1);
        }
    }
}
By storing events by tag in the dictionary, it’s nice and easy for other classes to listen for deserialised track data. This is an example of the HandReader listening for hand data:
var trackDataReader = TrackDataReader.Instance;
trackDataReader.Datasets[TrackDataReader.Tag.HandLm] += SerialiseHandLm;
trackDataReader.Datasets[TrackDataReader.Tag.HandHandedness] += SerialiseHandedness;
trackDataReader.Datasets[TrackDataReader.Tag.HandGesture] += SerialiseHandGesture;
using UnityEngine;

public class HandSync : MonoBehaviour
{
    [SerializeField] Canvas canvas;
    Camera cam;
    [SerializeField] Transform humanSegT;

    [System.Flags]
    public enum SegTransformChange
    {
        Position = 1, // 000001
        Rotation = 2, // 000010
        Scale = 4     // 000100
    }
    [SerializeField] SegTransformChange segTransformChanges;

    public static HandSync Instance { get; private set; }

    public void Awake()
    {
        Instance = this;
        cam = Camera.main;
    }

    // TODO: Make it work with camera transform changes
    public Vector3 ViewPortToOverlayPos(Vector3 viewPort)
    {
        float depth = viewPort.z;
        viewPort *= canvas.scaleFactor;
        viewPort.z += cam.nearClipPlane;
        Ray ray = cam.ViewportPointToRay(viewPort);
        Vector3 worldPos = ray.origin + ray.direction * (-ray.origin.z / ray.direction.z);
        worldPos += depth * Vector3.Distance(ray.origin, worldPos) * ray.direction;

        // Selective Transform Point
        {
            var worldToLocalMatrix = Matrix4x4.TRS(
                ChangeIsEnabled(SegTransformChange.Position) ? humanSegT.position : Vector3.zero,
                ChangeIsEnabled(SegTransformChange.Rotation) ? humanSegT.rotation : Quaternion.identity,
                ChangeIsEnabled(SegTransformChange.Scale) ? humanSegT.localScale : Vector3.one);
            worldPos = worldToLocalMatrix.MultiplyPoint3x4(worldPos);

            bool ChangeIsEnabled(SegTransformChange transformChange) => (segTransformChanges & transformChange) != 0; // If there is no match of 1s, then it is not present in bit mask
        }
        return worldPos;
    }
}
using UnityEditor;
using UnityEngine;
using LmTag = HandReader.Hand.LmTag;

public class HandDrawer : MonoBehaviour
{
    HandReader reader;

    [System.Flags]
    public enum DrawType
    {
        Landmarks = 1,  // 000001
        HandLines = 2,  // 000010
        Bounds = 4,     // 000100
        Labels = 8,     // 001000
        Axis = 16       // 010000
    }
    [SerializeField] public DrawType draws = DrawType.Landmarks | DrawType.HandLines;

    private void Start()
    {
        reader = HandReader.Instance;
    }

    bool DrawTypeIsEnabled(DrawType draw) => (draws & draw) != 0;

    private void OnDrawGizmos()
    {
        if (!Application.isPlaying)
            return;
        if (draws == 0)
            return;

        float lineThickness;
        foreach (var h in reader.hands)
        {
            if (DrawTypeIsEnabled(DrawType.Landmarks))
            {
                Gizmos.color = Color.yellow;
                foreach (var lm in h.Landmarks)
                {
                    Gizmos.DrawSphere(lm, 0.1f);
                }
            }
            if (DrawTypeIsEnabled(DrawType.HandLines))
            {
                lineThickness = 4;
                Handles.color = Color.grey;
                HandLine(LmTag.THUMB_CMC, LmTag.THUMB_MCP);
                HandLine(LmTag.THUMB_MCP, LmTag.THUMB_IP);
                HandLine(LmTag.THUMB_IP, LmTag.THUMB_TIP);
                HandLine(LmTag.WRIST, LmTag.INDEX_FINGER_MCP);
                HandLine(LmTag.INDEX_FINGER_MCP, LmTag.INDEX_FINGER_PIP);
                HandLine(LmTag.INDEX_FINGER_PIP, LmTag.INDEX_FINGER_DIP);
                HandLine(LmTag.INDEX_FINGER_DIP, LmTag.INDEX_FINGER_TIP);
                HandLine(LmTag.INDEX_FINGER_MCP, LmTag.MIDDLE_FINGER_MCP);
                HandLine(LmTag.MIDDLE_FINGER_MCP, LmTag.MIDDLE_FINGER_PIP);
                HandLine(LmTag.MIDDLE_FINGER_PIP, LmTag.MIDDLE_FINGER_DIP);
                HandLine(LmTag.MIDDLE_FINGER_DIP, LmTag.MIDDLE_FINGER_TIP);
                HandLine(LmTag.MIDDLE_FINGER_MCP, LmTag.RING_FINGER_MCP);
                HandLine(LmTag.RING_FINGER_MCP, LmTag.RING_FINGER_PIP);
                HandLine(LmTag.RING_FINGER_PIP, LmTag.RING_FINGER_DIP);
                HandLine(LmTag.RING_FINGER_DIP, LmTag.RING_FINGER_TIP);
                HandLine(LmTag.RING_FINGER_MCP, LmTag.PINKY_MCP);
                HandLine(LmTag.PINKY_MCP, LmTag.PINKY_PIP);
                HandLine(LmTag.PINKY_PIP, LmTag.PINKY_DIP);
                HandLine(LmTag.PINKY_DIP, LmTag.PINKY_TIP);
                HandLine(LmTag.PINKY_MCP, LmTag.WRIST);
            }
            if (DrawTypeIsEnabled(DrawType.Bounds))
            {
                lineThickness = 3;
                Handles.color = new Color(255, 0, 255);
                // Border
                BorderSide(-1, -1, 1, -1);
                BorderSide(1, -1, 1, 1);
                BorderSide(1, 1, -1, 1);
                BorderSide(-1, 1, -1, -1);
                // Center
                Gizmos.color = new Color(255, 0, 255);
                Gizmos.DrawSphere(h.Bounds.center, 0.1f);
            }
            if (DrawTypeIsEnabled(DrawType.Labels))
            {
                // Gesture & Handedness
                float screenSizeFactor = HandleUtility.GetHandleSize(Vector3.zero) * 0.1f;
                GUIStyle style = new(GUI.skin.label);
                style.fontStyle = FontStyle.Bold;
                style.normal.textColor = new Color(255, 0, 255);
                style.fontSize = Mathf.RoundToInt(100 * screenSizeFactor);
                Handles.Label(h.GetLm(LmTag.WRIST), h.Gesture.ToString(), style);
                Handles.Label(h.GetLm(LmTag.WRIST) + (Vector3.down * 1f), h.Handedness.ToString(), style);
            }
            if (DrawTypeIsEnabled(DrawType.Axis))
            {
                lineThickness = 5;
                Handles.color = Color.green;
                Line(h.GetLm(LmTag.WRIST), h.GetLm(LmTag.WRIST) + h.UpOrientation.Direction);
                Handles.color = Color.red;
                Line(h.GetLm(LmTag.WRIST), h.GetLm(LmTag.WRIST) + h.RightOrientation.Direction);
                Handles.color = Color.blue;
                Line(h.GetLm(LmTag.WRIST), h.GetLm(LmTag.WRIST) + h.ForwardOrientation.Direction);
            }

            // Helpers
            void Line(Vector3 start, Vector3 end) => Handles.DrawLine(start, end, lineThickness);
            void HandLine(LmTag start, LmTag end) => Line(h.GetLm(start), h.GetLm(end));
            void BorderSide(int signStartX, float signStartY, int signEndX, float signEndY)
            {
                Line(
                    h.Bounds.center + new Vector3(h.Bounds.extents.x * signStartX, h.Bounds.extents.y * signStartY),
                    h.Bounds.center + new Vector3(h.Bounds.extents.x * signEndX, h.Bounds.extents.y * signEndY));
            }
        }
    }
}
To make the use of the Python program seamless, it is run automatically. I stopped the Python console window from appearing while still capturing its output for debugging.
using System.Diagnostics;
using UnityEngine;

public class PythonProcessor : MonoBehaviour
{
    // Path must be consistent per device
    const string INTERPRETER_PATH = @"C:\Program Files\Python311\python.exe";

    [System.Serializable]
    public struct ScriptReference
    {
        public string name;
        public string directory;
    }
    [SerializeField] ScriptReference[] scriptRefs;
    [SerializeField] int scriptIndex;

    Process process;

    private void Start()
    {
        StartProgram();

        void StartProgram()
        {
            ScriptReference scriptRef = scriptRefs[scriptIndex];

            // Init start info
            ProcessStartInfo startInfo = new();
            startInfo.FileName = INTERPRETER_PATH;

            // Assign directory
            string location = $"{Application.dataPath}\\Python\\{scriptRef.directory}";
            startInfo.Arguments = $"{location}\\{scriptRef.name}.py";
            startInfo.WorkingDirectory = location;

            // Settings
            startInfo.UseShellExecute = false;
            startInfo.RedirectStandardOutput = true;
            startInfo.RedirectStandardError = true;
            startInfo.CreateNoWindow = true;

            // Init process
            process = new();
            process.StartInfo = startInfo;

            // Log Python console
            process.OutputDataReceived +=
                (sender, args) => UnityEngine.Debug.Log("Python Output: " + args.Data);
            process.ErrorDataReceived +=
                (sender, args) => UnityEngine.Debug.LogError("Python Error: " + args.Data);

            // Start process
            process.Start();
            process.BeginOutputReadLine();
            process.BeginErrorReadLine();
        }
    }

    // Terminate process and cleanup
    void EndProcess()
    {
        if (process != null && !process.HasExited)
        {
            process.Kill();
            process.Dispose();
        }
    }

    private void OnDisable() => EndProcess();
    private void OnDestroy() => EndProcess();
    private void OnApplicationQuit() => EndProcess();
}
Multithreading is used extensively to ensure responsiveness. It is used for the TCP communication systems, since we need a separate thread to receive data without blocking the main thread.
It is also used to distribute the different types of image processing across separate threads, so we are not waiting for all of them to finish before receiving results in Unity. This is most noticeable with the video capture, which now only needs to wait for human segmentation before being sent to Unity.
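A simplified sketch of this pattern on the Python side (the stage functions below are stand-ins for the real HumanSegmentation, HandGestureTracker and HandProcessor classes, and the capture details are assumed):

import threading

import cv2

latest_frame = None  # Shared newest frame, written by the capture thread

def capture_loop():
    global latest_frame
    cap = cv2.VideoCapture(0)
    while True:
        ok, frame = cap.read()
        if ok:
            latest_frame = frame

# Hypothetical stand-ins for the real processing stages
def run_segmentation(frame):
    return cv2.imencode(".jpg", frame)[1]

def run_hand_tracking(frame):
    return "(2, [['Open']])"

def start_stage(process, send):
    # Each stage loops on its own thread and sends its result as soon as it
    # is ready, so slower stages never hold up the camera feed
    def loop():
        while True:
            if latest_frame is not None:
                send(process(latest_frame))
    threading.Thread(target=loop, daemon=True).start()

threading.Thread(target=capture_loop, daemon=True).start()
start_stage(run_segmentation, camFeedSender.SendImgBuffer)
start_stage(run_hand_tracking, trackDataSender.SendStr)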
During development of our next-day test build, we faced consistent startup crashes after adding a new game prototype to the platform.
This was due to all inactive games still being present in the scene. Since we needed to feature a lot of games, they needed to be loaded additively to save memory.
At the time, Addressables was only being used to dynamically load and unload Timelines for game intros and outros.
To resolve this, I had to familiarise myself with the existing system and expand on it to also dynamically load and unload games.
This not only got rid of the crashes, but also drastically improved performance, making it more stable for testing at nurseries.
public async Task<GameObject> LoadGameSceneAsync(string gameName, Transform parent)
{
    string path = $"Assets/Scenes/Games/{gameName}/";
    string file = $"{gameName}.prefab";
    string fullPath = Path.Combine(path, file);

    AsyncOperationHandle<IList<IResourceLocation>> locationHandle =
        Addressables.LoadResourceLocationsAsync(fullPath);
    await locationHandle.Task;

    if (locationHandle.Status == AsyncOperationStatus.Succeeded)
    {
        AsyncOperationHandle<GameObject> handle =
            Addressables.InstantiateAsync(fullPath, parent);
        await handle.Task;
        return handle.Result;
    }
    else
    {
        Debug.LogError("Failed to load addressable asset: " + file);
        return null;
    }
}
To improve the Addressables groups, I used the Addressables Analyze tool to pack the asset bundles appropriately:
Packing Together by Label - I used this for organising subgroups within groups. In the screenshot on the right, you can see them labelled by module (M1, M2…) and lesson (numerals, letters…).
Packing Together - Used for grouping objects with dependencies that are typically loaded together, e.g. characters with their models and textures.
Packing Separately - Used for assets that are loaded separately, like game prefabs and timelines.
To further improve memory management, I also developed an ObjectPooler to reuse objects (characters, particle FX, hovering text) between games. It uses addressables instead of prefabs so that objects are never loaded until they are needed.
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.AddressableAssets;
using UnityEngine.ResourceManagement.AsyncOperations;

public class ObjectPooler : MonoBehaviour
{
    public interface IPoolObject { }

    public class Pool<T> where T : IPoolObject
    {
        public AssetReference reference;
        public Stack<T> pooled = new();

        public Pool(AssetReference reference)
        {
            this.reference = reference;
        }
    }

    Dictionary<Type, Pool<IPoolObject>> pools = new();

    [Serializable]
    public struct AddressableReference
    {
        public string type;
        public AssetReference reference;
    }
    [SerializeField] AddressableReference[] addressables;

    public static ObjectPooler Instance { get; private set; }

    private void Awake()
    {
        Instance = this;
        foreach (var addressable in addressables)
        {
            pools.Add(Type.GetType(addressable.type), new(addressable.reference));
        }
    }

    public async Task<T> Get<T>() where T : MonoBehaviour, IPoolObject
    {
        Pool<IPoolObject> pool = pools[typeof(T)];
        T obj = await GetObject();
        obj.gameObject.SetActive(true);
        return obj;

        async Task<T> GetObject()
        {
            if (pool.pooled.Count > 0)
                return (T)pool.pooled.Pop();
            else
            {
                AsyncOperationHandle<GameObject> handle =
                    Addressables.InstantiateAsync(pool.reference);
                await handle.Task;
                return handle.Status == AsyncOperationStatus.Succeeded
                    ? handle.Result.GetComponent<T>()
                    : null;
            }
        }
    }

    public void Recycle<T>(T obj) where T : MonoBehaviour, IPoolObject
    {
        obj.gameObject.SetActive(false);
        obj.transform.SetParent(transform);
        pools[typeof(T)].pooled.Push(obj);
    }
}
Currently I am working on a component pooler.
This is a rhythm game prototype I'm working on. I had to work closely with the music department to ensure that the simulated piano keys were accurate.
The biggest task I was assigned for this project was making a system that can generate musical notes from a piece of music. The first idea was to build an editing tool for sound engineers to manually add notes, which would have to take many musical factors into account (beat length, frequency, BPM…).
However, by using the DryWetMIDI .NET library, I implemented a much more effective solution that reads MIDI files and automatically generates the musical notes.
using Melanchall.DryWetMidi.Core;
using Melanchall.DryWetMidi.Interaction;
using System.IO;
using System.Linq;
using UnityEngine;

[CreateAssetMenu(fileName = "TrackInfo", menuName = "ScriptableObjects/TrackInfo", order = 1)]
public class TrackInfo : ScriptableObject
{
    public AudioClip instrumental;
    [SerializeField] string leadMidiFileStreamingAssetsPath;
    public float noteBarHeightAdjustmentFactor = 0.00125f;

    [ReadOnly] public double bpm;
    [SerializeField, ReadOnly] float quantity, quality;
    [ReadOnly] public float totalNumBars;
    [SerializeField, ReadOnly] float secondsPerBar;
    [ReadOnly] public float duration;

    public class Note
    {
        public int Number { get; set; }
        public MetricTimeSpan MetricStartTimeSpan { get; set; }
        public MetricTimeSpan MetricEndTimeSpan { get; set; }
        public long Length { get; set; }
    }
    public Note[] Notes { get; set; }

    public void RefreshInfo()
    {
        BetterStreamingAssets.Initialize();
        byte[] data = BetterStreamingAssets.ReadAllBytes(leadMidiFileStreamingAssetsPath);
        ParseInfo(data);
    }

    private void Awake()
    {
        RefreshInfo();
    }

    void ParseInfo(byte[] midiData)
    {
        MemoryStream stream = new(midiData);
        MidiFile midiFile = MidiFile.Read(stream);
        TempoMap tempoMap = midiFile.GetTempoMap();

        Notes = midiFile.GetNotes().Select(n => new Note
        {
            Number = n.NoteNumber,
            MetricStartTimeSpan = TimeConverter.ConvertTo<MetricTimeSpan>(n.Time, tempoMap),
            MetricEndTimeSpan = n.EndTimeAs<MetricTimeSpan>(tempoMap),
            Length = n.Length
        }).ToArray();

        Tempo tempo = tempoMap.GetTempoAtTime(
            new MetricTimeSpan(Notes[0].MetricStartTimeSpan.TotalMicroseconds));
        bpm = tempo.BeatsPerMinute;

        var firstTrackChunk = midiFile.GetTrackChunks().First();
        var firstTimeSignatureEvent = firstTrackChunk
            .Events
            .OfType<TimeSignatureEvent>()
            .FirstOrDefault();
        quantity = firstTimeSignatureEvent.Numerator;
        quality = firstTimeSignatureEvent.Denominator;
        secondsPerBar = quantity * 60000f * 0.001f / (float)bpm;

        var barBeatTimeOfLastEvent = midiFile.GetTimedEvents().Last().TimeAs<BarBeatTicksTimeSpan>(tempoMap);
        totalNumBars = barBeatTimeOfLastEvent.Bars;
        // If a bar is partially occupied, we need to add it to the total count of bars
        if (barBeatTimeOfLastEvent.Beats > 0 || barBeatTimeOfLastEvent.Ticks > 0)
            totalNumBars = barBeatTimeOfLastEvent.Bars + 1;
        duration = totalNumBars * secondsPerBar;
    }

    private void OnValidate()
    {
        if (File.Exists($"{Application.streamingAssetsPath}/{leadMidiFileStreamingAssetsPath}"))
        {
            RefreshInfo();
        }
    }
}