I have 2 files on Unix. I want to compare the files-
- To show the the rows which are missing in both files.
- To show the actual differences in both files.
- To be able to skip the columns for comparison which I don't want in report for e.g. d_report_ref_date
- My unique key for comparison is v_party_id
File 1:
d_report_ref_date="2021-03-31" n_pd_percent="0.16323687" v_accounting_standard="IFRS" v_party_default_status_cd="NOTDFLT" v_party_id="103811925" v_src_system_id="SMT"
d_party_default_status_date="2012-03-09" d_report_ref_date="2021-03-31" n_pd_percent="1" v_accounting_standard="SQRT" v_party_default_status_cd="UNLIKE" v_party_id="36056030" v_src_system_id="SMT"
d_report_ref_date="2021-03-31" n_pd_percent="0.16323687" v_accounting_standard="IFRS" v_party_default_status_cd="NOTDFLT" v_party_id="53565979" v_src_system_id="SMT"
d_report_ref_date="2021-03-31" n_pd_percent="0.16323687" v_accounting_standard="IFRS" v_party_default_status_cd="NOTDFLT" v_party_id="103811100" v_src_system_id="SMT"
d_report_ref_date="2021-03-31" n_pd_percent="0.16323687" v_accounting_standard="IFRS" v_party_default_status_cd="NOTDFLT" v_party_id="103811200" v_src_system_id="SMT"
File 2
d_report_ref_date="2021-03-31" n_pd_percent="0.2045" v_accounting_standard="SQRT" v_party_default_status_cd="NOTDFLT" v_party_id="103811925" v_src_system_id="SMT"
d_party_default_status_date="2012-03-09" d_report_ref_date="2021-03-31" n_pd_percent="1" v_accounting_standard="IFRS" v_party_default_status_cd="UNLIKE" v_party_id="36056030" v_src_system_id="SMT"
d_report_ref_date="2021-03-31" v_accounting_standard="IFRS" v_party_default_status_cd="NOTDFLT" v_party_id="53565979" v_src_system_id="SMT"
d_report_ref_date="2021-03-31" n_pd_percent="0.16323687" v_accounting_standard="IFRS" v_party_default_status_cd="NOTDFLT" v_party_id="103811400" v_src_system_id="SMT"
Expected Output
Rows missing in file1: v_party_id="103811400"
Rows missing in file2: v_party_id="103811100", v_party_id="103811200"
Mismtach in row 1 for v_party_id="103811925": file1.n_pd_percent="0.16323687" file2.n_pd_percent="0.2045", file1.v_accounting_standard="IFRS" file2.v_accounting_standard="SQRT"
Mismtach in row 2 for v_party_id="36056030": file1.v_accounting_standard="SQRT" file2.v_accounting_standard="IFRS"
Code:
BEGIN { FS="[= ]" }
NR==FNR {
for (i=1; i<NF; i =2) {
file1[NR,i] = $(i 1)
}
next
}
{
msg = sep = ""
for (i=1; i<NF; i =2) {
if ( $(i 1) != file1[FNR,i] ) {
msg = msg sep OFS ARGV[1] "." $i "=" file1[FNR,i] OFS FILENAME "." $i "=" $(i 1)
sep = ","
}
}
if ( msg != "" ) {
print "Mismtach in row " FNR msg
}
}
Actual Output
awk -f compare.awk file1 file2
Mismtach in row 1 file1.n_pd_percent="0.16323687" file2.n_pd_percent="0.2045", file1.v_accounting_standard="IFRS" file2.v_accounting_standard="SQRT"
Mismtach in row 2 file1.v_accounting_standard="SQRT" file2.v_accounting_standard="IFRS"
Mismtach in row 3 file1.v_accounting_standard="0.16323687" file2.v_accounting_standard="IFRS", file1.v_party_default_status_cd="IFRS" file2.v_party_default_status_cd="NOTDFLT", file1.v_party_id="NOTDFLT" file2.v_party_id="53565979", file1.v_src_system_id="53565979" file2.v_src_system_id="SMT"
Mismtach in row 4 file1.v_party_id="103811100" file2.v_party_id="103811400"
What changes can I do in the code to display the output in desired format?
CodePudding user response:
This'll do what I think you want, you might want to tweak the output format:
$ cat tst.awk
BEGIN { FS="[= ]" }
{
match(" "$0,/ v_party_id="[^"] "/)
key = substr($0,RSTART,RLENGTH)
}
NR==FNR {
file1[key] = $0
next
}
{
if ( key in file1 ) {
nf = split(file1[key],tmp)
for (i=1; i<nf; i =2) {
f1[key,tmp[i]] = tmp[i 1]
}
msg = sep = ""
for (i=1; i<NF; i =2) {
if ( $(i 1) != f1[key,$i] ) {
msg = msg sep OFS ARGV[1] "." $i "=" f1[key,$i] OFS FILENAME "." $i "=" $(i 1)
sep = ","
}
}
if ( msg != "" ) {
print "Mismatch in row " FNR msg
}
delete file1[key]
}
else {
file2[key] = $0
}
}
END {
for (key in file1) {
print "In file1 only:", key, file1[key]
}
for (key in file2) {
print "In file2 only:", key, file2[key]
}
}
$ awk -f tst.awk file1 file2
Mismatch in row 1 file1.n_pd_percent="0.16323687" file2.n_pd_percent="0.2045", file1.v_accounting_standard="IFRS" file2.v_accounting_standard="SQRT"
Mismatch in row 2 file1.v_accounting_standard="SQRT" file2.v_accounting_standard="IFRS"
In file1 only: v_party_id="103811200" d_report_ref_date="2021-03-31" n_pd_percent="0.16323687" v_accounting_standard="IFRS" v_party_default_status_cd="NOTDFLT" v_party_id="103811200" v_src_system_id="SMT"
In file1 only: v_party_id="103811100" d_report_ref_date="2021-03-31" n_pd_percent="0.16323687" v_accounting_standard="IFRS" v_party_default_status_cd="NOTDFLT" v_party_id="103811100" v_src_system_id="SMT"
In file2 only: v_party_id="103811400" d_report_ref_date="2021-03-31" n_pd_percent="0.16323687" v_accounting_standard="IFRS" v_party_default_status_cd="NOTDFLT" v_party_id="103811400" v_src_system_id="SMT"
