I would like to write a bash script, using awk, to determine how many lines start with each character.
Sample input: ./script.sh txt1 txt2 text1 text2 (filenames could be random too)
txt1
asdaga
dasdag
asdasdag
awqr
zvvbrh
tqetvh
xbrrte
txt2
npoajd
pojta
pskdna
nghir
asdt
bmkgjk
Sample output:
--- txt1 ---
a : 3
b : 0
c : 0
...
z : 1
...
ascii255 : 0
--- txt2 ---
a : 1
b : 1
...
p : 2
...
--- text3 ---
etc
where [character] : [number of rows that start with that character] is the correct format.
After printing every file one by one, I would also like to print a collective result that follows the same format, where each character's count is the sum of that character's counts across all the text files. In the given example (for only txt1 and txt2) the output would be:
a : 4
b : 1
...
(explanation: txt1 contains 3 lines that start with a, txt2 contains 1 line that starts with a, so the total will be 3 + 1 = 4)
Here is the code that I wrote, but I am stuck, it doesn't work, I am confused with the awk syntax:
#!/bin/bash
# Attempt from the question, kept as the non-working version being asked
# about; only the "++" operators that had been dropped from the listing are
# put back. Intended purpose: for each input file, count how many lines start
# with each character.
# Problems still present in this attempt:
#   - substr(1,1,1) takes the first character of the literal 1, not of $0
#   - counters are reset and printed by number (char[i], char[j]) but
#     incremented by character (char[actchar])
#   - the reset loop and the report sit inside the per-line action, so they
#     run for every input line (the report once per field of the line)
#   - "prinf" is not an awk statement (printf was meant)
#   - the main action is never closed with a final "}"
#   - no file names are handed to awk after the program text
awk '
{split($0,arr)
n=length(arr)
for(i=1;i<=255;i++){
char[i]=0;
}
for(i=1;i<=n;i++){
actchar=substr(1,1,1);
char[actchar]++;
printf("--- %s ---\n",FILENAME);
for(j=1;j<=255;j++){
prinf("%c : %s\n",j,char[j]);
}
}
'
CodePudding user response:
This solution safely skips multi-byte characters if that's the first character; works the same for gawk byte-mode or unicode-mode :
# Per-file report of how many lines start with each printable ASCII
# character (codes 32-126); the last column is a running total.
# The "+" characters that had been dropped from the listing ("_++", "+=",
# "]++") are restored. The conversion for the running-total column and for
# the footer sum was lost as well and is reconstructed as %10.f -- the exact
# width is an assumption, TODO confirm against the original answer.
% pv -q < "${m3t}" | mawk2 '
# printreport(counts, label): print one table for label from the counts
# array, then empty the array. The two extra parameters serve as locals
# (character code and running sum). Prints nothing when label is empty.
function printreport(__,___,_,____) {
if (___=="") {
return ___
}
printf(" ======= %s ================\n",___)
# 2^3*4 is 32 (space); the loop stops before 4^3*2-1, i.e. 127.
for (_=2^3*4;_<(4^3*2-1);_++) {
printf(" [ %s ] = %9.f | %10.f \n",
___=sprintf("%c",_),
__[___], ____+=__[___])
}
printf(" =====================================\n"\
" ASCII 32(spc)-126(~) sum = %10.f\n\n",____)
# split("",arr) empties the array and yields 0.
return split("",__)
}
# split() on the still-empty ___ returns 0, so _ becomes 1 and FS is set to
# the whole string "^$".
BEGIN { FS = substr("^$",\
_ = !split(___,__))
} FNR== _ {
# First line of a file: report the previous file (if any), then store the
# new label. printreport yields 0 or "", so the start index below is 1 and
# substr() keeps the whole name.
___=substr(FILENAME != "-" ? FILENAME \
: " /dev/fd/0 :: STDIN ", !-printreport(__,___))
} {
# $!_ is $0 because _ is 1; count the first character of the line.
__[substr($!_,_,_)]++
} END {
printreport(__,___) } ' "${m3l}" "${m3m}" '/dev/stdin' | ecp;
======= .../m23lyricsFLT_05.txt ================
[ ] = 7 | 7
[ ! ] = 0 | 7
[ " ] = 51 | 58
[ # ] = 62 | 120
[ $ ] = 3 | 123
[ % ] = 0 | 123
[ & ] = 0 | 123
[ ' ] = 443 | 566
[ ( ] = 1766 | 2332
[ ) ] = 2 | 2334
[ * ] = 944 | 3278
[ + ] = 1 | 3279
[ , ] = 1 | 3280
[ - ] = 75 | 3355
[ . ] = 22 | 3377
[ / ] = 58 | 3435
[ 0 ] = 158142 | 161577
[ 1 ] = 2090 | 163667
[ 2 ] = 131 | 163798
[ 3 ] = 57 | 163855
[ 4 ] = 31 | 163886
[ 5 ] = 53 | 163939
[ 6 ] = 16 | 163955
[ 7 ] = 38 | 163993
[ 8 ] = 11 | 164004
[ 9 ] = 22 | 164026
[ : ] = 6 | 164032
[ ; ] = 1 | 164033
[ < ] = 158 | 164191
[ = ] = 0 | 164191
[ > ] = 3 | 164194
[ ? ] = 18 | 164212
[ @ ] = 8 | 164220
[ A ] = 1552 | 165772
[ B ] = 1407 | 167179
[ C ] = 1210 | 168389
[ D ] = 1186 | 169575
[ E ] = 570 | 170145
[ F ] = 568 | 170713
[ G ] = 796 | 171509
[ H ] = 2211 | 173720
[ I ] = 6825 | 180545
[ J ] = 397 | 180942
[ K ] = 160 | 181102
[ L ] = 1516 | 182618
[ M ] = 941 | 183559
[ N ] = 737 | 184296
[ O ] = 1640 | 185936
[ P ] = 460 | 186396
[ Q ] = 40 | 186436
[ R ] = 925 | 187361
[ S ] = 2286 | 189647
[ T ] = 2119 | 191766
[ U ] = 348 | 192114
[ V ] = 943 | 193057
[ W ] = 2353 | 195410
[ X ] = 14 | 195424
[ Y ] = 2941 | 198365
[ Z ] = 30 | 198395
[ [ ] = 3669 | 202064
[ \ ] = 0 | 202064
[ ] ] = 0 | 202064
[ ^ ] = 0 | 202064
[ _ ] = 0 | 202064
[ ` ] = 0 | 202064
[ a ] = 291 | 202355
[ b ] = 251 | 202606
[ c ] = 246 | 202852
[ d ] = 127 | 202979
[ e ] = 88 | 203067
[ f ] = 74 | 203141
[ g ] = 108 | 203249
[ h ] = 403 | 203652
[ i ] = 572 | 204224
[ j ] = 62 | 204286
[ k ] = 48 | 204334
[ l ] = 204 | 204538
[ m ] = 174 | 204712
[ n ] = 135 | 204847
[ o ] = 363 | 205210
[ p ] = 77 | 205287
[ q ] = 6 | 205293
[ r ] = 292 | 205585
[ s ] = 376 | 205961
[ t ] = 288 | 206249
[ u ] = 98 | 206347
[ v ] = 319 | 206666
[ w ] = 404 | 207070
[ x ] = 11 | 207081
[ y ] = 522 | 207603
[ z ] = 22 | 207625
[ { ] = 4 | 207629
[ | ] = 0 | 207629
[ } ] = 0 | 207629
[ ~ ] = 3 | 207632
=====================================
ASCII 32(spc)-126(~) sum = 207632
======= .../m3vid_genie26.txt ================
[ ] = 0 | 0
[ ! ] = 1 | 1
[ " ] = 4 | 5
[ # ] = 106 | 111
[ $ ] = 8 | 119
[ % ] = 1 | 120
[ & ] = 6 | 126
[ ' ] = 294 | 420
[ ( ] = 188 | 608
[ ) ] = 0 | 608
[ * ] = 5 | 613
[ + ] = 2 | 615
[ , ] = 0 | 615
[ - ] = 4 | 619
[ . ] = 50 | 669
[ / ] = 0 | 669
[ 0 ] = 86 | 755
[ 1 ] = 521 | 1276
[ 2 ] = 457 | 1733
[ 3 ] = 198 | 1931
[ 4 ] = 178 | 2109
[ 5 ] = 150 | 2259
[ 6 ] = 86 | 2345
[ 7 ] = 126 | 2471
[ 8 ] = 91 | 2562
[ 9 ] = 123 | 2685
[ : ] = 0 | 2685
[ ; ] = 0 | 2685
[ < ] = 46 | 2731
[ = ] = 0 | 2731
[ > ] = 3 | 2734
[ ? ] = 6 | 2740
[ @ ] = 0 | 2740
[ A ] = 3190 | 5930
[ B ] = 4078 | 10008
[ C ] = 3279 | 13287
[ D ] = 3330 | 16617
[ E ] = 1474 | 18091
[ F ] = 2745 | 20836
[ G ] = 2337 | 23173
[ H ] = 3139 | 26312
[ I ] = 5411 | 31723
[ J ] = 981 | 32704
[ K ] = 893 | 33597
[ L ] = 4264 | 37861
[ M ] = 4134 | 41995
[ N ] = 1972 | 43967
[ O ] = 1996 | 45963
[ P ] = 2409 | 48372
[ Q ] = 94 | 48466
[ R ] = 2262 | 50728
[ S ] = 6701 | 57429
[ T ] = 5794 | 63223
[ U ] = 717 | 63940
[ V ] = 554 | 64494
[ W ] = 4119 | 68613
[ X ] = 106 | 68719
[ Y ] = 1644 | 70363
[ Z ] = 145 | 70508
[ [ ] = 20079 | 90587
[ \ ] = 0 | 90587
[ ] ] = 0 | 90587
[ ^ ] = 0 | 90587
[ _ ] = 0 | 90587
[ ` ] = 0 | 90587
[ a ] = 117 | 90704
[ b ] = 132 | 90836
[ c ] = 128 | 90964
[ d ] = 83 | 91047
[ e ] = 60 | 91107
[ f ] = 114 | 91221
[ g ] = 104 | 91325
[ h ] = 103 | 91428
[ i ] = 143 | 91571
[ j ] = 26 | 91597
[ k ] = 21 | 91618
[ l ] = 117 | 91735
[ m ] = 145 | 91880
[ n ] = 72 | 91952
[ o ] = 67 | 92019
[ p ] = 95 | 92114
[ q ] = 4 | 92118
[ r ] = 68 | 92186
[ s ] = 222 | 92408
[ t ] = 149 | 92557
[ u ] = 16 | 92573
[ v ] = 22 | 92595
[ w ] = 167 | 92762
[ x ] = 2 | 92764
[ y ] = 47 | 92811
[ z ] = 4 | 92815
[ { ] = 0 | 92815
[ | ] = 0 | 92815
[ } ] = 0 | 92815
[ ~ ] = 3 | 92818
=====================================
ASCII 32(spc)-126(~) sum = 92818
======= /dev/stdin ================
[ ] = 0 | 0
[ ! ] = 5 | 5
[ " ] = 7062 | 7067
[ # ] = 3889 | 10956
[ $ ] = 308 | 11264
[ % ] = 165 | 11429
[ & ] = 3210 | 14639
[ ' ] = 38770 | 53409
[ ( ] = 105671 | 159080
[ ) ] = 307 | 159387
[ * ] = 11556 | 170943
[ + ] = 240 | 171183
[ , ] = 0 | 171183
[ - ] = 14565 | 185748
[ . ] = 27 | 185775
[ / ] = 2010 | 187785
[ 0 ] = 5489 | 193274
[ 1 ] = 51256 | 244530
[ 2 ] = 41364 | 285894
[ 3 ] = 20015 | 305909
[ 4 ] = 12961 | 318870
[ 5 ] = 9864 | 328734
[ 6 ] = 7294 | 336028
[ 7 ] = 6514 | 342542
[ 8 ] = 5800 | 348342
[ 9 ] = 5525 | 353867
[ : ] = 7 | 353874
[ ; ] = 0 | 353874
[ < ] = 2433 | 356307
[ = ] = 0 | 356307
[ > ] = 226 | 356533
[ ? ] = 17 | 356550
[ @ ] = 281 | 356831
[ A ] = 375661 | 732492
[ B ] = 331981 | 1064473
[ C ] = 271228 | 1335701
[ D ] = 270206 | 1605907
[ E ] = 144476 | 1750383
[ F ] = 262067 | 2012450
[ G ] = 158453 | 2170903
[ H ] = 204592 | 2375495
[ I ] = 501327 | 2876822
[ J ] = 119037 | 2995859
[ K ] = 94295 | 3090154
[ L ] = 280855 | 3371009
[ M ] = 312797 | 3683806
[ N ] = 160272 | 3844078
[ O ] = 160304 | 4004382
[ P ] = 197434 | 4201816
[ Q ] = 19418 | 4221234
[ R ] = 163032 | 4384266
[ S ] = 494497 | 4878763
[ T ] = 461447 | 5340210
[ U ] = 51570 | 5391780
[ V ] = 79325 | 5471105
[ W ] = 269542 | 5740647
[ X ] = 6973 | 5747620
[ Y ] = 162431 | 5910051
[ Z ] = 19564 | 5929615
[ [ ] = 36976 | 5966591
[ \ ] = 0 | 5966591
[ ] ] = 199 | 5966790
[ ^ ] = 13 | 5966803
[ _ ] = 594 | 5967397
[ ` ] = 0 | 5967397
[ a ] = 59000 | 6026397
[ b ] = 39103 | 6065500
[ c ] = 23406 | 6088906
[ d ] = 17316 | 6106222
[ e ] = 9960 | 6116182
[ f ] = 27632 | 6143814
[ g ] = 15660 | 6159474
[ h ] = 21529 | 6181003
[ i ] = 43845 | 6224848
[ j ] = 7824 | 6232672
[ k ] = 5854 | 6238526
[ l ] = 25302 | 6263828
[ m ] = 25061 | 6288889
[ n ] = 17172 | 6306061
[ o ] = 29060 | 6335121
[ p ] = 11470 | 6346591
[ q ] = 1561 | 6348152
[ r ] = 10232 | 6358384
[ s ] = 42816 | 6401200
[ t ] = 72947 | 6474147
[ u ] = 6623 | 6480770
[ v ] = 1806 | 6482576
[ w ] = 57864 | 6540440
[ x ] = 969 | 6541409
[ y ] = 38921 | 6580330
[ z ] = 1544 | 6581874
[ { ] = 272 | 6582146
[ | ] = 0 | 6582146
[ } ] = 3 | 6582149
[ ~ ] = 406 | 6582555
=====================================
ASCII 32(spc)-126(~) sum = 6582555
CodePudding user response:
This may be what you're trying to do, using any awk:
$ cat tst.sh
#!/usr/bin/env bash
# Count how many lines start with each character, per input file and in
# total, for the character codes beg..end (lowercase a-z as written).
# Usage: tst.sh [file ...]    (reads standard input when no file is given)
# Output: a "--- <file> ---" header followed by "<char> : <count>" lines for
# every file, then the same table under "--- Total ---".
awk '
{
    # Tally the first character of every line, keyed by file and character.
    char = substr($0,1,1)
    cnt[FILENAME,char]++
}
END {
    OFS = " : "
    beg = 97     # character code of "a"
    end = 122    # character code of "z"
    for ( fileNr=1; fileNr<ARGC; fileNr++ ) {
        fname = ARGV[fileNr]
        print "--- " fname " ---"
        for ( charNr=beg; charNr<=end; charNr++ ) {
            char = sprintf("%c", charNr)
            # Adding 0 makes a never-seen (empty) counter print as 0.
            print char, cnt[fname,char]+0
            tot[char] += cnt[fname,char]
        }
    }
    print "--- Total ---"
    for ( charNr=beg; charNr<=end; charNr++ ) {
        char = sprintf("%c", charNr)
        print char, tot[char]
    }
}
' "${@:--}"
$ ./tst.sh txt1 txt2
--- txt1 ---
a : 3
b : 0
c : 0
d : 1
e : 0
f : 0
g : 0
h : 0
i : 0
j : 0
k : 0
l : 0
m : 0
n : 0
o : 0
p : 0
q : 0
r : 0
s : 0
t : 1
u : 0
v : 0
w : 0
x : 1
y : 0
z : 1
--- txt2 ---
a : 1
b : 1
c : 0
d : 0
e : 0
f : 0
g : 0
h : 0
i : 0
j : 0
k : 0
l : 0
m : 0
n : 2
o : 0
p : 2
q : 0
r : 0
s : 0
t : 0
u : 0
v : 0
w : 0
x : 0
y : 0
z : 0
--- Total ---
a : 4
b : 1
c : 0
d : 1
e : 0
f : 0
g : 0
h : 0
i : 0
j : 0
k : 0
l : 0
m : 0
n : 2
o : 0
p : 2
q : 0
r : 0
s : 0
t : 1
u : 0
v : 0
w : 0
x : 1
y : 0
z : 1
If you want to loop over some larger range of characters just change the beg and end variable settings.
