Matlab obtains the pitch frequency and formant information of the speech and marks it on the spectrogram

1.matlab finds the fundamental frequency F0 of speech

%% Extract fundamental frequency      
[f0,idx] = pitch(x,fs, 'WindowLength',wlen,'OverlapLength',inc);% Get the pitch frequency of the speech
fn1=size(f0,1);
result_f0=f0(find(f0<400)); % Filters the pitch frequencies that match the criteria
figure()
plot(result_f0);% plot the pitch frequency
title('Pitch frequency f0')
xlabel('frame')
ylabel('frequency/Hz')
axis([0 fn1 0 400])% sets the coordinate axis range

Drawing showing the results:

2. Matlab finds the voice formants (F1, F2, F3)

clc;
clear all;

%% Read voice files
filename='bluesky1.wav';
[x,fs]=audioread(filename);%Read wav file

%% Resonance peak extraction, Frmt is F1, F2, F3
wlen=512;inc=256;          
[voiceseg,vsl,SF_a,NF] = VAD(x,fs,wlen,inc);

%Extract formant features
Doption=0;
Frmt=Formant_ext2(x,wlen,inc,fs,SF_a,Doption);

F1=(Frmt(1,:));F1_min=min(F1);F1_max=max(F1);F1_mean=mean(F1);
F2=(Frmt(2,:));F2_min=min(F2);F2_max=max(F2);F2_mean=mean(F2);
F3=(Frmt(3,:));F3_min=min(F3);F3_max=max(F3);F3_mean=mean(F3);

The following functions are used

VAD.m
function [voiceseg,vsl,SF,NF] = VAD(y,fs,wlen,inc)
%UNTITLED2 A summary about this function is shown here
% Show detailed description here
N=length(y); % Get the signal length
time=(0:N-1)/fs; % Calculate time scale
IS=0.25; overlap=wlen-inc; % Set the length of the leading silent segment
NIS=fix((IS*fs-wlen)/inc +1); % Calculate the number of leading silent frames
frames=enframe(y,wlen,inc)';                 % framing
fn=size(frames,2); % number of frames
amp=sum(frames.^2); % Find the short-term average energy
zcr=zc2(frames,fn); % Calculate the short-term average zero-crossing rate  
ampm = multimidfilter(amp,5); % median filter smoothing
zcrm = multimidfilter(zcr,5);         
ampth=mean(ampm(1:NIS)); % Calculate the average value of the energy and zero-crossing rate of the initial non-talking segment 
zcrth=mean(zcrm(1:NIS));
amp2=1.1*ampth; amp1=1.3*ampth; % Set the threshold for energy and zero-crossing rate
zcr2=0.9*zcrth;

frameTime=frame2time(fn, wlen, inc, fs);% Calculate the time corresponding to each frame
[voiceseg,vsl,SF,NF]=vad_param2D_revr(amp,zcr,amp2,amp1,zcr2);% Endpoint detection

end

%% other functions
function [voiceseg,vsl,SF,NF]=vad_param2D_revr(dst1,dst2,T1,T2,T3,T4)

fn=length(dst1); % Get the frame number
maxsilence = 8; % initialization  
minlen  = 5;    
status  = 0;
count   = 0;
silence = 0;

%Start endpoint detection
xn=1;
for n=1:fn
   switch status
   case {0,1} % 0 = mute, 1 = may start
      if dst1(n) > T2 | ... % Be sure to enter the speech segment
         ( nargin==6 & dst2(n) < T4 ) 
         x1(xn) = max(n-count(xn)-1,1);
         status  = 2;
         silence(xn) = 0;
         count(xn)   = count(xn) + 1;
      elseif dst1(n) > T1 | ... % may be in speech segment
             dst2(n) < T3
         status = 1;
         count(xn)  = count(xn) + 1;
      else % mute state
         status  = 0;
         count(xn)   = 0;
         x1(xn)=0;
         x2(xn)=0;
      end
   case 2, % 2 = speech segment
      if dst1(n) > T1 | ... % stay in speech segment
         dst2(n) <  T3 
         count(xn) = count(xn) + 1;
      else % speech will end
         silence(xn) = silence(xn)+1;
         if silence(xn) < maxsilence % Silence is not long enough to end
            count(xn)  = count(xn) + 1;
         elseif count(xn) < minlen % The voice length is too short, which is considered as noise
            status  = 0;
            silence(xn) = 0;
            count(xn)   = 0;
         else % end of voice
            status  = 3;
            x2(xn)=x1(xn)+count(xn);
         end
      end
   case 3, % voice ends, prepare for next voice
        status  = 0;          
        xn=xn+1; 
        count(xn)   = 0;
        silence(xn)=0;
        x1(xn)=0;
        x2(xn)=0;
   end
end   

el=length(x1);
if x1(el)==0, el=el-1; end % Get the actual length of x1
if x2(el)==0 % If the last value of x2 is 0, set it to fn
%     fprintf('Error: Not find endding point!\n');
    x2(el)=fn;
end
SF=zeros(1,fn); % According to x1 and x2, assign values ​​to SF and NF
NF=ones(1,fn);
for i=1 : el
    SF(x1(i):x2(i))=1;
    NF(x1(i):x2(i))=0;
end
speechIndex=find(SF==1); % Calculate voiceseg
voiceseg=findSegment(speechIndex);
vsl=length(voiceseg);
end

%% other functions
function y=multimidfilter(x,m)
a=x;
for k=1 : m
    b=medfilt1(a, 5); 
    a=b;
end
y=b;
end

%% other functions
function zcr=zc2(y,fn)
if size(y,2)~=fn, y=y'; end
wlen=size(y,1); % Set frame length
zcr=zeros(1,fn); % initialization
delta=0.01; % Set a small threshold
for i=1:fn
    yn=y(:,i); % Get a frame
    for k=1 : wlen % center clipping processing
        if yn(k)>=delta
            ym(k)=yn(k)-delta;
        elseif yn(k)<-delta
            ym(k)=yn(k)+delta;
        else
            ym(k)=0;
        end
    end
    zcr(i)=sum(ym(1 end)<0);  % Get the zero crossing rate of the processed frame data
end
end

%% other functions
function soundSegment=findSegment(express)
if express(1)==0
    voicedIndex=find(express); % Find the position of 1 in express
else
    voicedIndex=express;
end

soundSegment = [];
k = 1;
soundSegment(k).begin = voicedIndex(1); % Set the starting position of the first group of voiced segments
for i=1:length(voicedIndex)-1,
	if voicedIndex(i+1)-voicedIndex(i)>1, % This group has ended the segment
		soundSegment(k).end = voicedIndex(i); % Set the end position of the segment in this group
		soundSegment(k+1).begin = voicedIndex(i+1);% Set the starting position of the next group of speech segments  
		k = k+1;
	end
end
soundSegment(k).end = voicedIndex(end); % The end position of the last group of voiced segments
% Calculate the length of each group of segments
for i=1 :k
    soundSegment(i).duration=soundSegment(i).end-soundSegment(i).begin+1;
end
end
Formant_ext2.m
function Fmap=Formant_ext2(x,wlen,inc,fs,SF,Doption)

y=filter([1 -.99],1,x); % pre-emphasis
xy=enframe(y,wlen,inc)';                              % framing
fn=size(xy,2); % Find the number of frames
Nx=length(y); % data length
time=(0:Nx-1)/fs; % time scale
frameTime=frame2time(fn,wlen,inc,fs); % The time scale corresponding to each frame
Msf=repmat(SF',1,3); % Expand SF into a 3×fn array
Ftmp_map=zeros(fn,3);
warning off
Fsamps = 256; % Set frequency domain length
Tsamps= fn; % Set the time domain length
ct = 0;

numiter = 10; % loop 10 times,
iv=2.^(10-10*exp(-linspace(2,10,numiter)/1.9)); % Calculate 10 numbers between 0 and 1024
for j=1:numiter
    i=iv(j);                                          
    iy=fix(length(y)/round(i)); % Calculate the number of frames
    [ft1] = seekfmts1(y,iy,fs,10); % Extract formant with known frame number
    ct = ct+1;
    ft2( ,ct) = interp1(linspace(1,length(y),iy)',...% interpolate ft1 data to Tsamps length
    Fsamps*ft1',linspace(1,length(y),Tsamps)')';
end
ft3 = squeeze(nanmean(permute(ft2,[3 2 1]))); % rearrange and average processing
tmap = repmat([1:Tsamps]',1,3);
Fmap=ones(size(tmap))*nan; % initialization
idx = find(~isnan(sum(ft3,2))); % Find non-nan positions
fmap = ft3(idx,:); % stores non-nan data

[b,a] = butter(9,0.1); % Design a low-pass filter
fmap1 = round(filtfilt(b,a,fmap)); % low-pass filtering
fmap2 = (fs/2)*(fmap1/256); % restore to actual frequency
Ftmp_map(idx,:)=fmap2; % output data

Fmap=Ftmp_map';
findex=find(Fmap==nan);
Fmap(findex)=0;
% plotting
if Doption
figure(99)
nfft=512;
d=stftms(y,wlen,nfft,inc);
W2=1+nfft/2;
n2=1:W2;
freq=(n2-1)*fs/nfft;
Fmap1=Msf.*Ftmp_map; % Get only the data with segments
findex=find(Fmap1==0); % If any value is 0, set it to nan
Fmapd=Fmap1;
Fmapd(findex)=nan;
imagesc(frameTime,freq,abs(d(n2,:)));  axis xy
m = 64; LightYellow = [0.6 0.6 0.6];
MidRed = [0 0 0]; Black = [0.5 0.7 1];
Colors = [LightYellow; MidRed; Black];
colormap(SpecColorMap(m,Colors)); hold on
plot(frameTime,Fmapd,'w'); % Superimpose the formant frequency curve
title('Mark the formant frequencies on the spectrogram');
xlabel('time/s'); ylabel('frequency/Hz')
end
end
%% other functions
function [fmt] = seekfmts1(sig,Nt,fs,Nlpc)
if nargin<4, Nlpc = round(fs/1000)+2; end;
ls=length(sig); % data length
Nwin = floor(ls/Nt); % frame length

for m=1:Nt,
    lpcsig = sig((Nwin*(m-1)+1):min([(Nwin*m) ls]));% Get a frame of signal
    
    if ~isempty(lpcsig),
        a = lpc(lpcsig,Nlpc); % Calculate LPC coefficient
        const=fs/(2*pi); % constant  
        rts=roots(a);                             % root
        k=1; % initialization
        yf = [];
        bandw=[];
        for i=1:length(a)-1                     
            re=real(rts(i)); % take the real part of the root
            im=imag(rts(i)); % Take the imaginary part of the root
            formn=const*atan2(im,re); % Calculate the formant frequency
            bw=-2*const*log(abs(rts(i))); % calculation bandwidth
    
            if formn> 150&bw < 700&formn <fs/ 2%, formant and bandwidth can be obtained
            yf(k)=formn;
            bandw(k)=bw;
            k=k+1;
            end
        end

        [y, ind]=sort(yf); % sort
        bw=bandw(ind);
        F = [NaN NaN NaN]; % initialization
        F(1:min(3,length(y))) = y(1:min(3,length(y))); % output up to three
        F = F(:); % output in columns
        fmt(:,m)=F/(fs/2); % normalized frequency  
    end;
end;
end
frame2time.m
function frameTime=frame2time(frameNum,framelen,inc,fs)
% Calculate the time corresponding to each frame after dividing the frame
frameTime=(((1:frameNum)-1)*inc+framelen/2)/fs;
end
enframe.m
function f=enframe(x,win,inc)

nx=length(x(:)); % Get the data length
nwin=length(win); % Get the window length
if (nwin == 1) % determines whether the window length is 1. If it is 1, it means there is no window function
   len = win; % yes, frame length = win
else
   len = nwin; % No, frame length = window length
end
if (nargin < 3) % If there are only two arguments, set frame inc= frame length
   inc = len;
end
nf = fix((nx-len+inc)/inc); % Calculate the number of frames
f=zeros(nf,len); % initialization
indf= inc*(0:(nf-1)).'; % Set the displacement position of each frame in x
inds = (1:len); % Each frame of data corresponds to 1:len
f(:) = x(indf( )); % frame the data
if (nwin > 1) % If the parameter includes a window function, multiply each frame by the window function
    w = win(:)'; % Convert win into row data
    f = f .* w(ones(nf,1),:); % window function
end
end

Draw the formants F1, F2, F3

% Draw formants
figure()
plot(F1,'r','LineWidth',0.3)
hold on
plot(F2,'m','LineWidth',0.3)
plot(F3,'k','LineWidth',0.3)
fn=size(F1,2);% frame number
%Set x-axis
frame2time=linspace(0,fn,9);
times=0:0.5:4;
xticks(frame2time);
xticklabels(times);
xlim([0 fn]);%Set x-axis范围
ylim([0 fs/2]);%Set x-axis范围

%Set legend font and size
h=legend('F1','F2','F3');
set(h,'FontName','Times New Roman','FontSize',11,'FontWeight','normal')
xlabel('time/s');
ylabel('frequency/Hz');
title('Resonance Peak Label');

Formant display display

3.matlab draws speech spectrogram

figure()
spectrogram(x,wlen,inc,nfft,fs,'yaxis');

Drawing display:

4. Draw the resonance peaks onto the spectrogram and display them

figure()
wlen=512;
inc=256;
nfft=wlen;

spec_data=spectrogram(x,wlen,inc,nfft,fs,'yaxis');
spec_data=abs(spec_data).*abs(spec_data); % molding
spec_data=10*log10(spec_data);% take the logarithm
dim2freq=linspace(0,fs/2,size(spec_data,1));
xlabels=0:1:size(spec_data,2);
imagesc(xlabels,dim2freq,spec_data);% draw spectrogram

colorbar();%display colorbar()
axis xy;%reverse the order
hold on
plot(F1,'r','LineWidth',0.3)
plot(F2,'m','LineWidth',0.3)
plot(F3,'k','LineWidth',0.3)

%Set x-axis
frame2time=linspace(0,size(spec_data,2),9);
times=0:0.5:4;
xticks(frame2time);
xticklabels(times);

%Set legend font and size
h=legend('F1','F2','F3');
set(h,'FontName','Times New Roman','FontSize',11,'FontWeight','normal')
xlabel('time/s');
ylabel('frequency/Hz');
title('Annotation of formant peaks to spectrogram');

Plot the result:

5.Description

  1. In experimentpitchfunction sumspectrogramThe functions come with matlab. If the experimenter cannot find these two functions on his computer, he can search for the function online and copy it to the file path. The author’s matlab version is matlab 2019a.
  2. The .m file represented in the center requires readers to copy all its contents and name it as the corresponding matlab function file.
  3. Due to limited time, only the code implementation part is shown. If there is time later, I will continue to add the content.

Related Posts

Sorting function in Python–sorted() function

python one-way circular linked list

Six-axis robotic arm DIY (4) Mechanical model reconstruction and DH method modeling

Python implementation of SSL communication using OpenSSL

Passing of flask parameters

Detailed explanation of the usage of fspecial function in matlab for image processing

What are the common classifications of software testing tests?

WAVE audio format and conversion code

Leave a Reply

Your email address will not be published. Required fields are marked *

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <s> <strike> <strong>

*