%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% File: parseCHAT.m
%
% Parses a .cha file. 
%
% Inputs:
%   * file: The .cha file to parse.
%   * show_output: 1 to see the output on 
%       the command line (Defaults to 0).
%
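% Outputs: 
%   * babbles: Cell array with one entry per babble (an utterance whose
%       speech is 'xxx .'). Each entry is a cell of the form
%       {line number, speech, phonemes, result of syllabifyBabble}.
%   * good_parses: The full parses, as returned by assessBabbles.
%   * non_babbles: A record of the non babbles in the chat. Each entry
%       is a cell of the form {line number, speech, phonemes}.
%   * gender / years / months / days / language:
%       Info about the child, taken from the @ID header as strings.
%       gender is 'male', 'female' or 'unknown'.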
%
% Usage: [babbles good_parses non_babbles gender years months days language] = parseCHAT('Ben03.cha')
%
% Author: Doug Bemis
% Date: 11/27/11
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [babbles, good_parses, non_babbles, gender, years, months, days, language] = parseCHAT(file, show_output)
% Parses the headers and the *CHI: / %pho: utterance pairs of a .cha file.
%
% Expected layout:
%   @UTF8
%   @Begin
%   @Languages: ...
%   @Participants: ...
%   @ID: language|corpus|code|age|gender|...
%   @Media: ...        (optional)
%   @Date: ...         (optional when @Media: is absent)
%   *CHI: ...          (repeated; each optionally followed by %pho: ...)
%   @End
% %com: / @Comment: lines in front of an utterance or @End are skipped.
%
% An error is raised if the file cannot be opened, if a header or tier is
% not the expected one, or if the file ends before @End. The file is
% closed on every exit path.
%
% NOTE(review): the file is opened with the platform default encoding
% even though the header declares UTF-8 -- confirm that this is what
% syllabifyBabble expects.

% Default to not show
if nargin < 2
    show_output = 0;
end

% Open up the file, and make sure it gets closed again even if one of
% the format checks below raises an error.
fid = fopen(file);
if fid == -1
    error('Could not open file %s. Exiting...', file);
end
closer = onCleanup(@() fclose(fid));

% Get the headers
% TODO: Use the contents of the languages / participants / media / date
%   headers. For now only their tags are validated.
if ~strcmp(fgetl(fid),'@UTF8')
    error('Bad encoding line. Exiting...');
end
if ~strcmp(fgetl(fid),'@Begin')
    error('Bad begin line. Exiting...');
end
if ~strcmp(splitTag(fgetl(fid)),'@Languages:')
    error('Bad language line. Exiting...');
end
if ~strcmp(splitTag(fgetl(fid)),'@Participants:')
    error('Bad participants line. Exiting...');
end
[tag, id] = splitTag(fgetl(fid));
if ~strcmp(tag,'@ID:')
    error('Bad ID line. Exiting...');
end

% Parse our data from the ID line. Split on every '|' so that an empty
% field stays in its own position (strtok would merge consecutive
% delimiters and shift the following fields).
fields = regexp(id, '\|', 'split');
if numel(fields) < 5
    fields(numel(fields)+1:5) = {''};
end
language = strtrim(fields{1});
age = fields{4};        % Formatted as years;months.days
gender = fields{5};
[years, rest_age] = strtok(age,';');
[months, rest_age] = strtok(rest_age(2:end),'.');
days = rest_age(2:end);
if ~strcmp(gender,'male') && ~strcmp(gender,'female')
    gender = 'unknown';
end

% l_ctr is the line number in the .cha CLAN browser. Five header lines
% have been read so far; the optional headers below are counted as they
% are found, so the numbering stays right for every header layout.
l_ctr = 5;

% missing_data == 1 means that 'line' already holds the next line to
% process, so the main loop must not read a new one.
missing_data = 0;
line = '';

% @Media: and @Date: are both optional
next_line = fgetl(fid);
tag = splitTag(next_line);
if strcmp(tag,'@Media:')
    l_ctr = l_ctr+1;
    
    % The date has to follow the media
    if ~strcmp(splitTag(fgetl(fid)),'@Date:')
        error('Bad date line. Exiting...');
    end
    l_ctr = l_ctr+1;
elseif strcmp(tag,'@Date:')
    l_ctr = l_ctr+1;
elseif strcmp(tag,'*CHI:')
    
    % No media and no date: this is already the first utterance
    line = next_line;
    missing_data = 1;
else
    error('Bad media line. Exiting...');
end

% Now, parse the babbling
babbles = {};   % Hold the babbling data
non_babbles = {};
while 1
    
    % Get the next line
    if ~missing_data        % Don't advance if we already have it...
        line = fgetl(fid);
    end
    
    % Might have comments before the next utterance or the end
    while ischar(line) && any(strcmp(strtok(line),{'%com:','@Comment:'}))
        l_ctr = l_ctr+1;
        line = fgetl(fid);
    end
    if strcmp(line,'@End')
        break;
    end
    if ~ischar(line)
        error('Unexpected end of file. Exiting...');
    end
    
    % For now, we're expecting a strict format of
    %   *CHI: ...
    %   %pho: ...
    [speaker, speech] = strtok(line);
    if ~strcmp(speaker,'*CHI:')
        error('Unknown speaker. Exiting...');
    end 
    l_ctr = l_ctr+1;
    
    % The phonemes should be next
    line = fgetl(fid);
    if ~ischar(line)
        error('Unexpected end of file. Exiting...');
    end
    [pho_tier, phonemes] = strtok(line);
    
    % Might be missing data, in which case we just read the start of the
    % next utterance (or the end) and need to keep it for the next pass.
    missing_data = 0;
    if strcmp(pho_tier,'*CHI:') || strcmp(pho_tier,'@End')
        missing_data = 1;
        phonemes = '';
    elseif strcmp(pho_tier,'%pho:')
        l_ctr = l_ctr+1;
    else
        error('Unknown tier. Exiting...');
    end
    
    % And we only want babbling, which will be
    %   defined now as speech = xxx .
    [first_sp, rest_sp] = strtok(speech);
    second_sp = strtok(rest_sp);
    if strcmp(first_sp,'xxx') && strcmp(second_sp,'.')
        
        % Take off the tab (char 9) that separates tier and phonemes
        if ~isempty(phonemes)
            if phonemes(1) == 9
                phonemes = phonemes(2:end);
            else
                error('Badly formed phonemes. Exiting...');
            end
        end
        babbles{end+1} = {l_ctr, speech, phonemes, syllabifyBabble(phonemes, l_ctr)}; %#ok<AGROW>
    else
        non_babbles{end+1} = {l_ctr, speech, phonemes}; %#ok<AGROW>
    end
end

% Done with the file: clearing the cleanup object closes it now
clear closer;

% And assess
good_parses = assessBabbles(babbles,show_output);


% Splits a line into its leading whitespace-delimited tag and the rest.
% Anything that is not text (e.g. the -1 fgetl returns at the end of the
% file) gives two empty strings, so that the caller's tag check fails.
function [tag, rest] = splitTag(line)
if ischar(line)
    [tag, rest] = strtok(line);
else
    tag = '';
    rest = '';
end

