%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% File: getPhonemesFromBytes.m
%
% Helper that converts byte strings to phonemes.
%   Initial parsing from a .cha file comes as a string
%   of unicode bytes. This function treats each
%   byte with a 0 as the most significant bit
%   as a single character. Otherwise, we look for
%   a modifier (e.g. the long mark - [203 144]) and
%   put it with the preceding character. If it's not
%   a modifier, then we treat it as a two-byte character.
%
% * Loosely based off of http://en.wikipedia.org/wiki/UTF-8
%
% Inputs:
%   * bytes: The bytes to convert
%
% Outputs:
%   * phonemes: The converted bytes
%
% Author: Doug Bemis
% Date: 11/27/11
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function phonemes = getPhonemesFromBytes(bytes)
% GETPHONEMESFROMBYTES Split a string into phonemes given as byte vectors.
%
%   phonemes = getPhonemesFromBytes(bytes)
%
%   The input is converted with unicode2native and scanned left to right:
%     * A byte below 128 becomes a one-byte phoneme.
%     * A known modifier byte pair (see modifier_pairs below) is appended
%       to the preceding phoneme, or dropped when the global
%       ignore_phoneme_modifiers is nonzero.
%     * The pair [201 170] directly after a lone 'a' (97) is merged into
%       that phoneme (diphthong), regardless of the global.
%     * Any other high byte is grouped with the byte that follows it as a
%       two-byte phoneme.
%
%   Malformed edges are tolerated rather than raising an index error:
%     * A modifier with no preceding phoneme is kept as its own entry
%       (or dropped when modifiers are ignored).
%     * A high byte that is the last byte of the input is kept as a
%       one-byte entry.
%
%   Inputs:
%     * bytes: The string to convert.
%
%   Outputs:
%     * phonemes: Cell array with one entry per phoneme; each entry holds
%                 that phoneme's byte values as returned by unicode2native.

% Global to control parsing options
global ignore_phoneme_modifiers;

% Default is to keep modifiers. Note that this writes the global, so the
%   default stays visible to other code that declares it.
if isempty(ignore_phoneme_modifiers)
    ignore_phoneme_modifiers = 0;
end

% First, get the byte codes.
% NOTE(review): the byte pairs below are UTF-8 sequences, but the call uses
%   the platform default encoding. Presumably the default is UTF-8, or the
%   input already holds one byte per character - confirm against the caller.
ph_bytes = unicode2native(bytes);

% Two-byte sequences that attach to the phoneme before them. Code points
%   are what each pair decodes to when read as UTF-8.
modifier_pairs = [ ...
    203 144; ... % U+02D0 length mark       (e.g. line 154 in Ben10.cha, phoneme 2)
    204 131; ... % U+0303 nasal mark        (e.g. line 74 in Ben02.cha, phoneme 1)
    204 165; ... % U+0325 ring below        (e.g. line 92 in Ben03.cha, phoneme 2)
    203 140; ... % U+02CC low vertical line (e.g. line 144 in Ben02.cha, phoneme 2)
    204 154; ... % U+031A left angle above  (e.g. line 128 in Ben02.cha, phoneme 3)
    202 176];    % U+02B0 superscript 'h'   (e.g. line 132 in Ben08.cha, phoneme 2)

% Go through and assign
n_bytes = length(ph_bytes);
phonemes = {};
p = 1;
while p <= n_bytes

    lead = ph_bytes(p);

    % Bytes with a 0 most significant bit (0..127) are a single phoneme.
    if lead < 128
        phonemes{end+1} = lead;
        p = p + 1;
        continue;
    end

    % A high byte at the very end has no partner byte to pair with;
    %   keep it on its own instead of indexing past the end.
    if p == n_bytes
        phonemes{end+1} = lead;
        break;
    end

    trail = ph_bytes(p+1);
    has_previous = ~isempty(phonemes);
    is_modifier = any(modifier_pairs(:,1) == double(lead) & ...
                      modifier_pairs(:,2) == double(trail));

    if is_modifier && ignore_phoneme_modifiers
        % Modifiers are being ignored: drop the pair.

    elseif is_modifier && has_previous
        % Attach the modifier to the phoneme before it.
        phonemes{end} = [phonemes{end} lead trail];

    elseif lead == 201 && trail == 170 && has_previous && ...
            length(phonemes{end}) == 1 && phonemes{end}(1) == 97
        % This is the diphthong 'a' + small capital 'I'.
        %   - It's {97 [201 170]}. Merged even when modifiers are ignored.
        phonemes{end} = [phonemes{end} lead trail];

    else
        % Otherwise, the default is that these two bytes are a character.
        %   A modifier with nothing before it also ends up here.
        phonemes{end+1} = [lead trail];
    end

    % Both bytes of the pair have been consumed.
    p = p + 2;
end

