%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% File: analyzeChild.m
%
% Turns an initial parse into useful 
%   counts of syllables and transitions
%   and where they occurred for all of
%   the chats for a child.
%
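% Inputs (in call order; everything after child is optional):
%   * database: The database folder the child is a part of.
%   * child: The child's name
%   * save_file: File to save results, babbles and files to.
%       - If empty (the default) nothing is saved.
%   * should_print: 1 to write the analysis to the text file
%       [database '_' child '_Analysis.txt'].
%       - Defaults to 0.
%   * results: The initial tabulation, if already done.
%       - If empty will redo.
%       - Always redone when the babbles are reparsed.
%   * babbles: The initial parses of the files.
%       - If empty will redo.
%   * files: The files the babbles came from.
%       - Will recalculate with the babbles.
%   * good_parses: The per-file count returned by parseCHAT that is
%       used to compute the percent parsed.
%       - Will recalculate with the babbles.
%   * non_babbles: The per-file third output of parseCHAT; its length
%       is used to compute the percent babbling.
%       - Will recalculate with the babbles.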
%
% Outputs: 
%   * results: The results of the analysis.
%   * babbles: The parsed babbles.
%   * files: The files for the babbles.
%
% Usage: [results babbles files] = analyzeChild('English-Davis_CHAT', 'Ben');
%
% Author: Doug Bemis
% Date: 11/27/11
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [results babbles files] = analyzeChild(database, child, save_file, should_print, results, ...
    babbles, files, good_parses, non_babbles)
% Parses (if needed), tabulates and counts the syllables and
%   transitions in every .cha file found in [database '/' child],
%   then optionally writes a text report and saves the outcome.
%
% Inputs (everything after child is optional):
%   * database: Folder containing the child's folder.
%   * child: Name of the child's folder inside database.
%   * save_file: If non-empty, results, babbles and files are saved
%       to this file. Defaults to ''.
%   * should_print: If true, a report is written to
%       [database '_' child '_Analysis.txt']. Defaults to 0.
%   * results: A previous tabulation. Redone if empty, and always
%       redone when the babbles are reparsed.
%   * babbles: Previously parsed babbles, one cell per file.
%       Reparsed (together with files, good_parses and non_babbles)
%       if empty.
%   * files: The file names the babbles came from.
%   * good_parses: Per-file count returned by parseCHAT.
%   * non_babbles: Per-file third output of parseCHAT.
%
% Outputs:
%   * results: Struct with fields files, syllables, transitions,
%       perc_parsed, perc_babbling, syl_freq, syl_pos_freq, tr_freq,
%       tr_pos_freq, syl_tr, utt_len, syl_mean_occ, syl_std_occ,
%       tr_mean_occ and tr_std_occ.
%   * babbles: The parsed babbles.
%   * files: The files for the babbles.

% Global to control parsing options
global treat_doubled_as_edge;

% Default to treat doubled as edge
if isempty(treat_doubled_as_edge)
    treat_doubled_as_edge = 1;
end

% Fill in whatever trailing arguments the caller left off
if ~exist('save_file','var')
    save_file = '';
end
if ~exist('should_print','var')
    should_print = 0;
end
if ~exist('results','var')
    results = '';
end
if ~exist('babbles','var')
    babbles = '';
end
if ~exist('files','var')
    files = '';
end
if ~exist('good_parses','var')
    good_parses = '';
end
if ~exist('non_babbles','var')
    non_babbles = '';
end

% Default to reparse
if isempty(babbles)

    % Get all the babbles
    chats = dir([database '/' child]);

    % And parse them. Anything passed in is discarded, including a
    %   previous tabulation, since it no longer matches the parses.
    files = {};
    babbles = {};
    good_parses = [];
    non_babbles = {};
    results = '';
    disp('Parsing Chats...');
    tic;
    for c = 1:length(chats)
        if length(chats(c).name) > 3 && strcmp(chats(c).name(end-3:end),'.cha')
            files{end+1} = chats(c).name;
            [babbles{end+1} good_parses(end+1) non_babbles{end+1}] = parseCHAT([database '/' child '/' chats(c).name]);
        end
    end
    disp(['    Took ' num2str(toc) ' seconds...']);
else
    disp('Using given data.');
end


% Only tabulate if we haven't yet
if isempty(results)

    % The loop below indexes all three per-file inputs, so fail with
    %   a readable message instead of an index error when the caller
    %   supplied babbles without the matching parse statistics.
    if length(babbles) < length(files) || length(good_parses) < length(files) || ...
            length(non_babbles) < length(files)
        error('analyzeChild:missingParseData', ...
            'babbles, good_parses and non_babbles need one entry per file to tabulate.');
    end

    % Note, both results.syllables and results.transitions have two cells
    %   * The first is the syllable number, or two for transitions
    %   * The second has five fields:
    %       - The babble number (i.e. file)
    %       - The utterance number in the babble
    %       - The line number of the utterance
    %       - The syllable in the utterance
    %       - The number of syllables in the utterance
    disp('Tabulating parses...');
    tic;
    results.files = {};
    results.syllables = {};
    results.transitions = {};
    results.perc_parsed = [];
    results.perc_babbling = [];
    for f = 1:length(files)
        results.files{end+1} = files{f};
        results.perc_parsed(end+1) = good_parses(f) / length(babbles{f});
        results.perc_babbling(end+1) = length(babbles{f}) / (length(babbles{f}) + length(non_babbles{f}));
        results = tabulateBabble(babbles{f}, f, results);
    end
    disp(['    Took ' num2str(toc) ' seconds...']);
else
    disp('Using given tabulation.');
end

% Count them up
disp('Counting occurences...');
tic;
[results.syl_freq results.syl_pos_freq] = countOccurrences(results.syllables, results.files);
[results.tr_freq results.tr_pos_freq] = countOccurrences(results.transitions, results.files);

% Count up the number of syllables and transitions in each session
session_syl = countOccurencesPerFile(results.syl_freq, results.files);
session_tr = countOccurencesPerFile(results.tr_freq, results.files);

% Get the number of different transitions per syllable per file
results.syl_tr = zeros([size(results.syllables), length(results.files)]);

% TODO: Figure out how to use sparse best...
% find returns a column vector here, so iterate by position rather
%   than with "for i = tr_ind", which would run the body only once
%   with the whole vector.
tr_ind = find(results.tr_freq > 0);
[tr_x tr_y tr_z] = ind2sub(size(results.tr_freq), tr_ind);
for i = 1:length(tr_ind)

    % Add for both syllables
    results.syl_tr(1,tr_x(i),tr_z(i)) = results.syl_tr(1,tr_x(i),tr_z(i)) + 1;
    results.syl_tr(1,tr_y(i),tr_z(i)) = results.syl_tr(1,tr_y(i),tr_z(i)) + 1;
end

% Get the average length of a babble in syllables for each session.
%   Babbles with no syllables are left out of the average; a session
%   with none at all ends up as NaN (0/0).
results.utt_len = zeros(1,length(results.files));
for f = 1:length(results.files)
    num_parses = 0;
    num_babbles = length(babbles{f});
    for b = 1:num_babbles
        num_syl = length(babbles{f}{b}{4}{1}{1});
        if num_syl > 0
            results.utt_len(f) = results.utt_len(f) + num_syl;
            num_parses = num_parses+1;
        end
    end
    results.utt_len(f) = results.utt_len(f) / num_parses;
end


% Get the mean and standard deviation of occurrences. Each
%   occurrence contributes the index of the file it was found in.
num_syllables = size(results.syl_freq,2);
results.syl_mean_occ = zeros(1,num_syllables);
results.syl_std_occ = zeros(1,num_syllables);
for s = 1:num_syllables
    occ = [];
    for f = 1:length(results.files)
        occ = [occ repmat(f, 1, results.syl_freq(1,s,f))];
    end
    results.syl_mean_occ(s) = mean(occ);
    results.syl_std_occ(s) = std(occ);
end

results.tr_mean_occ = zeros(size(results.transitions));
results.tr_std_occ = zeros(size(results.transitions));
for i = 1:length(tr_ind)

    % A transition shows up once per file it occurs in, so only
    %   compute it the first time we see it
    if results.tr_mean_occ(tr_x(i),tr_y(i)) == 0
        occ = [];
        for f = 1:length(results.files)
            occ = [occ repmat(f, 1, results.tr_freq(tr_x(i),tr_y(i),f))];
        end
        results.tr_mean_occ(tr_x(i),tr_y(i)) = mean(occ);
        results.tr_std_occ(tr_x(i),tr_y(i)) = std(occ);
    end
end
disp(['    Took ' num2str(toc) ' seconds...']);

% Print the results
if should_print
    tic;
    report_name = [database '_' child '_Analysis.txt'];
    fid = fopen(report_name,'w');
    if fid == -1
        error('analyzeChild:fileOpen', 'Could not open %s for writing.', report_name);
    end

    % Make sure the file gets closed even if printing fails
    try

        % The basics. All data goes through %s so that names
        %   containing % or \ are written as they are.
        num_tr = nnz(results.tr_mean_occ > 0);
        fprintf(fid, 'Files: %s\n', num2str(length(results.files)));
        fprintf(fid, 'Syllables: %s\n', num2str(length(results.syllables)));
        fprintf(fid, 'Transitions: %s\n\n', num2str(num_tr));

        % The file counts
        fprintf(fid,'File\tNum_Syllables\tNum_transitions\tUtt_Len\n');
        for f = 1:length(results.files)
            fprintf(fid, '%s\t%s\t%s\t%s\n', results.files{f}, num2str(session_syl(f)), ...
                num2str(session_tr(f)), num2str(results.utt_len(f)));
        end

        % Print out the weighted frequency averages. The syllables
        %   run along the second dimension of syl_freq.
        fprintf(fid,'\nSyllable\tMean occurrence\tStandard Deviation\n');
        for s = 1:size(results.syl_freq,2)
            if isempty(results.syllables{1,s})
                continue;
            end
            fprintf(fid, '%s\t%s\t%s\n', num2str(s), num2str(results.syl_mean_occ(1,s)), ...
                num2str(results.syl_std_occ(1,s)));
        end

        fprintf(fid,'\nTR_1\tTR_2\tMean occurrence\tMean occurrence of Reverse\tStandard Deviation\n');
        for x = 1:size(results.tr_freq,1)
            for y = 1:size(results.tr_freq,2)
                if isempty(results.transitions{x,y})
                    continue;
                end

                % The reverse transition may never have occurred
                if size(results.tr_mean_occ,1) >= y && size(results.tr_mean_occ,2) >= x && results.tr_mean_occ(y,x) > 0
                    w_freq_R = num2str(results.tr_mean_occ(y,x));
                else
                    w_freq_R = '--';
                end
                fprintf(fid, '%s\t%s\t%s\t%s\t%s\n', num2str(x), num2str(y), ...
                    num2str(results.tr_mean_occ(x,y)), w_freq_R, num2str(results.tr_std_occ(x,y)));
            end
        end

        % Print out...
        printValues(fid, 'Syllable', 'Frequency', 'Total', results.syl_freq, results.files);
        printValues(fid, 'Tr_1\tTr2', 'Frequency', 'Total', results.tr_freq, results.files);
%     printValues(fid, 'Syllable', 'Syl_Edge_Perc', 'Average', ...
%         (1-(results.syl_mid_freq ./ results.syl_freq)), results.files);
%     printValues(fid, 'Tr_1\tTr2', 'Tr_Edge_Perc', 'Average', ...
%         (1-(results.tr_mid_freq ./ results.tr_freq)), results.files);
%     printValues(fid, 'Syllable', 'Syl_Double_Freq', 'Average', results.syl_double_freq, results.files);
%     printValues(fid, 'Tr_1\tTr2', 'Tr_Double_Freq', 'Average', results.tr_double_freq, results.files);
%    printValues(fid, 'Syllable', 'Num_Transistions', 'Average', results.syl_tr, results.files);
%     printValues(fid, 'Syllable', 'Min_Placement', 'Average', results.syl_min_place, results.files);
%    printValues(fid, 'Syllable', 'Fr_Placement', 'Average', results.syl_fr_place, results.files);
%    printValues(fid, 'Syllable', 'End_Placement', 'Average', results.syl_end_place, results.files);
%     printValues(fid, 'Tr_1\tTR_2', 'Min_Placement', 'Average', results.tr_min_place, results.files);
%    printValues(fid, 'Tr_1\tTR_2', 'Fr_Placement', 'Average', results.tr_fr_place, results.files);
%    printValues(fid, 'Tr_1\tTR_2', 'End_Placement', 'Average', results.tr_end_place, results.files);
    catch print_err
        fclose(fid);
        rethrow(print_err);
    end
    fclose(fid);
    disp(['    Took ' num2str(toc) ' seconds...']);    
end

% And save the results of the analysis 
if ~isempty(save_file)
    save(save_file,'results','babbles','files');
end
disp('Done.');


% Helper
function [freq pos_freq] = countOccurrences(list, files)
% Counts, for every cell of list and every file, how many entries
%   there are (freq) and how those entries are placed with respect
%   to their neighbours (pos_freq).
%
% Each entry of a cell is a vector whose first element is the file
%   number, whose fourth element is the position in the utterance
%   and whose fifth element is the utterance length. The first
%   three elements together identify the utterance.
%
% pos_freq has two extra trailing dimensions, the first for the
%   transition type on the left and the second for the one on the
%   right. Mapping of indices to types:
%   1 - No transition
%   2 - Transition to same syllable
%   3 - Transition to different syllable

num_files = length(files);
freq = zeros([size(list), num_files]);
pos_freq = zeros([size(list), num_files, 3, 3]);

for row = 1:size(list,1)
    for col = 1:size(list,2)

        % Nothing recorded for this cell
        entries = list{row,col};
        if isempty(entries)
            continue;
        end

        num_entries = length(entries);
        for k = 1:num_entries
            cur = entries{k};

            % Tally the occurrence for its file
            file_idx = cur(1);
            freq(row,col,file_idx) = freq(row,col,file_idx) + 1;

            % Left side: assume a different neighbour unless we are
            %   first in the utterance, or the previous entry is the
            %   same utterance one position earlier.
            left_type = 3;
            if cur(4) == 1
                left_type = 1;
            elseif k > 1
                prev = entries{k-1};
                if all(cur(1:3) == prev(1:3)) && prev(4) == cur(4)-1
                    left_type = 2;
                end
            end

            % Right side: assume a different neighbour unless we are
            %   last in the utterance, or the next entry is the same
            %   utterance one position later.
            right_type = 3;
            if cur(4) == cur(5)
                right_type = 1;
            elseif k < num_entries
                next = entries{k+1};
                if all(cur(1:3) == next(1:3)) && next(4) == cur(4)+1
                    right_type = 2;
                end
            end

            % And record
            pos_freq(row,col,file_idx,left_type,right_type) = ...
                pos_freq(row,col,file_idx,left_type,right_type) + 1;
        end
    end
end

% Helper
function count = countOccurencesPerFile(freq, files)
% Returns a column vector with one entry per file, holding the
%   number of (row, column) positions of freq whose count for that
%   file is greater than zero.

num_files = length(files);
count = zeros(num_files,1);

% Nothing to look at if there are no rows or no columns
if size(freq,1) == 0 || size(freq,2) == 0
    return;
end

for f = 1:num_files
    present = freq(:,:,f) > 0;
    count(f) = sum(present(:));
end

% Helper
function printValues(fid, label, type, stat, values, files)
% Writes one tab-separated table to fid: a header line followed by
%   one line for every (x, y) position of values that is greater
%   than zero in at least one file.
%
% Inputs:
%   * fid: Identifier of a file opened for writing.
%   * label: Header for the index column(s). It is used as part of
%       the format string, so escapes such as \t in it are expanded.
%   * type: Name of the quantity, used in the header.
%   * stat: 'Total' to print the sum over files, 'Average' to print
%       the mean over the files with a value greater than zero.
%   * values: Array indexed as (x, y, file).
%   * files: Cell array of file names, one per page of values.
%
% Errors if stat is neither 'Total' nor 'Average' and there is at
%   least one line to print.

disp(['Printing ' label ' ' type '...']);

% Only label belongs in the format string; everything else is data
%   and goes through %s so that a % or \ in it is written literally.
fprintf(fid,['\n' label '\t%s_%s'], stat, type);
for f = 1:length(files)
    fprintf(fid,'\t%s', files{f});
end
fprintf(fid,'\n');
for x = 1:size(values,1)
    for y = 1:size(values,2)

        % Make sure we have it
        per_file = reshape(values(x,y,:), 1, []);
        if ~any(per_file > 0)
            continue;
        end
        
        % Print the label. The first index is left out when there
        %   is only a single row.
        if size(values,1) > 1
            fprintf(fid,'%s\t', num2str(x));
        end
        fprintf(fid,'%s\t', num2str(y));
        
        % Print the stat
        if strcmp(stat, 'Total')
            fprintf(fid,'%s', num2str(sum(per_file)));
        elseif strcmp(stat, 'Average')
            fprintf(fid,'%s', num2str(mean(per_file(per_file > 0))));
        else
            error('Unknown stat requested.');
        end

        % Print the values
        for f = 1:length(files)
            fprintf(fid,'\t%s', num2str(values(x,y,f)));
        end
        fprintf(fid,'\n');
    end
end
