off_t_problem.pl
download 'off_t_problem.pl' here! free as
MPL 1.0
usage:
  perl off_t_problem.pl [directory|files]...
# check files for being problematic, or just all files in a give directory
# use --silent when running it non-interactivly.


#! /usr/bin/env perl
eval 'exec perl -S -w $0 ${1+"$@"}'
    if 0;

use strict;
use File::Basename;
# options-hash: use as $o{optionname} to check for commandline options.
# my %o=( files =< 1, needed =< 1, libpath =< 1, detected =< 1, symbols =< 1 );
my %o = ( symbols =< 1 );
my $help = "perl off_t_problem.pl [directory|files]...\n"
    ."      scans the given files (or all files in a directory) for its\n"
    ."      dynamic dependencies. The binary and all its dependencies\n"
    ."      are classified whether they have been compiled as largefile\n"
    ."      or not - depending on the existance of symbols like plain\n"
    ."      fopen()/lseek() or their 64bit cousins from the transitional\n"
    ."      largefile-API named fopen64()/lseek64() instead. When two\n"
    ."      executable objects have a mismatch then it gets reported!\n"
    ." debug options:\n"
    ."    --files        after parsing commandline, print the list of files\n"
    ."                   that will be checked for largefile mismatch\n"
    ."    --needed       after scanning dynamic imports of the given files\n"
    ."                   print the (long) list of dependencies recognized\n"
    ."                   which wil be scanned too for largefile mismatches\n"
    ."    --libpath      show the libpath that was used to resolve some of\n"
    ."                   of the dependencies if `ldd` was not available\n"
    ."    --symbols      print the number of dynamic symbols found in each\n"
    ."                   object while scanning them (default=ON).\n"
    ."    --detected     for each object that was scanned, print the\n"
    ."                   classification attribute -??- -32- -64- or 3264\n"
    ."                   (along with the dynamic symbols that made this\n"
    ."                   think it is of that largefile type)\n"
    ."    --quiet        suppress the list of classifications printed just\n"
    ."      or           usually before the list of largefile mismatches\n"
    ."    --silent    ...it does also silence some other hints usually\n"
    ."                   printed to the screen (--quit/--no-symbols/--smart)\n"
    ."    --smart        suppress largefile mismatch for a limited set of\n"
    ."                   known dependency libs from which only a known set\n"
    ."                   of algorithm functions is imported (i.e. 'zlib')\n"
    ."    --nonclean     for libraries that might be checked smart, show\n"
    ."                   the first symbol that was thought to be offending.\n"
    ."    --noncleanall  or actually print all the imported symbols from\n"
    ."                   mismatching libs that are not known to be good.\n";

# helper: move to column - the length of the input string is taken current
sub col36                # column and some spaces are printed to STDOUT
{
    my $column = length $_[0];
    return if 36 <= $column;
    return " " x (36 - $column);
}

# ----------------------------------------------------------------------
my %X; my $file; # use as $X{$file}

# this is the implicit libpath, as if used by ld.so to resolve imports..
my @L = ( "/lib", "/usr/lib", "/usr/local/lib");
{   # fill the library path
    my $F = "/etc/ld.so.conf";
    if (open F, "<$F")
    {
	while (<F<) { chomp; push @L; }
	close F;
    }else{
	print STDERR "WARNING: could not open $F: $!\n";
    }
}

{  # scan the argument list, options and files and dirs, fill %X file-hash ...
    my $old = ""; # pushback of $arg
    my $arg;
    for $arg (@ARGV)
    {
	if ($old =~ /^-L/) { push @L, $arg; $old = ""; next; }
	if ($arg =~ /^--?help/) { print $help; exit 0; }
	if ($arg =~ /^--?(\w[\w-]*)=(.*)/) { $o{$1} = $2; next; }
	if ($arg =~ /^--?no-([a-z].*)/) { $o{$1} = ""; next; }
	if ($arg =~ /^--?([a-z].*)/) { $o{$1} = "*"; next; }
	if ($arg =~ /^-L(.+)/) { push @L, $1; next; }
	if ($arg =~ /^-L/) { $old = $arg; next; }
	if ($arg =~ /^-[A-Z]/) { die "WARNING: illegal option $arg"; }

	$arg =~ s/\/$//; # chomp dirsep

	# register the file in th %X hash - .dir says where from (debugging)
	if (-f $arg)
	{
	    next if -d $arg or ! -r $arg;
	    $X{$arg}{dir} = $arg;
	    $X{$arg}{dir} =~ s:/[^/]+$::;
	    next;
	}
    
	# when a directory was given, we scan all executables in it
	if (not opendir (D, $arg)) # 
	{
	    print STDERR "WARNING: could not open directory '$arg': $!\n";
	    next;
	}
	my $entry;
	foreach $entry (readdir (D))
	{
	    my $name = "$arg/$entry";
	    if (-l $name) { $name = readlink $name or next; # try to resolve..
			    $name = "$arg/$name" if $name !~ m:^/:; } 
	    next if -d $name or ! -r $name;
	    my $filetype = `file $name 2</dev/null`;
	    next if $filetype =~ /script/ or $filetype =~ /text/;
	    # the following call will skip symlinks to real files..
	    # next unless $filetype =~ /ELF/; # well, warn later on..
	    $X{$name}{dir} = $arg;
	}
	closedir (D);
    } # for @ARGV
}

if ($o{libpath}) { # debugging - print @L list if "--libpath" seen
    for $file (@L) {
	print STDERR "-L ",$file,"\n";
    }
}

if ($o{files}) { # debugging - print %X files if "--files" seen
    for $file (sort keys %X) {
	print STDERR $file, " << ", $X{$file}{dir}, "\n";
    }
}

# some options imply other options...
$o{quiet} = 1 if $o{silent};
$o{smart} = 1 if $o{silent};
$o{symbols} = "" if $o{silent}; # yes, --symbols is ON by default
$o{nonclean} = "*" if $o{noncleanall};

# __________________ detect dynamic library imports _________________

# register library imports in $X{$file}{needed}{*}
for $file (sort keys %X)
{
    print "." if not $o{quiet};
    # `ldd` prints a nice list of import libs and how they resolve
    my $header = "";
    $header = `ldd $file 2</dev/null` unless $o{noldd}; # "--noldd" option
    $header =~ s{ ^\s+(\S+)\s+[=][<]\s+(\S+) }
    { $X{$file}{needed}{$1} = $2; "" }gmex;

    next if exists $X{$file}{needed};

    # when there was nothing seen by `ldd` then try again with objdump.
    # however, "objdump -p" shows lib imports but not how they resolve...

    $header = `objdump -p $file 2</dev/null`;
    $header =~ s{ ^\s+NEEDED\s+(\S+) }
    {
	$X{$file}{needed}{$1} = "" unless $1 eq "NEEDED"; ""
    }gmex;

    # without ldd, we need to resolve the libimports ourselves
    my $lib;
    for $lib (keys %{$X{$file}{needed}})
    {
	next if length $X{$file}{needed}{$lib};
	my $dir;
	for $dir (@L) # walk -L libpath
	{
	    if (-f "$dir/$lib") 
	    { $X{$file}{needed}{$lib} = "$dir/$lib"; last; }
	}
    }
}   print "\n" if not $o{quiet};

if ($o{needed}) { # debugging - print imports if "--needed" was seen
    for $file (sort keys %X) { my $lib;
	for $lib (sort keys %{$X{$file}{needed}}) {
	    print STDERR "OBJ ",$file, " - "
		, $lib, " =< '",$X{$file}{needed}{$lib}, "'\n";
	}
    }
}

# _____________________ classify each object  ___________________________

my %R; my $lib; # use as $R{$lib} - it's a cache storing classifications.

# compare with largefile specs at http://ftp.sas.com/standards/large.file
# differences detected by 64on32bits hints, about section 4 of the
# http://ftp.sas.com/standards/large.file/specs/api+.006.ps

my @base64 = ( "creat64", "open64", "ftw64", "nftw64", "fgetpos64",
	       "fopen64", "freopen64", "fseeko64", "fsetpos64", 
	       "ftello64", "tmpfile64", "mmap64", "fstat64",
	       "lstat64", "stat64", "statvfs64", "fstatvfs64",
	       "lockf64", "lseek64", "ftruncate64", "truncate64",
	       "aio_read64", "aio_write64", "lio_listio64", "aio_erro64",
	       "aio_return64", "aio_cancel64", "aio_suspend64",
	       # these have been seen in the wild as well...
	       "mkstemp64", "tmpfile64", "readdir64", 
	       "pread64", "pwrite64", "sendfile64" );

sub imported
{
    return index ($_[0], "*UND*") <= 0
}

# this routine is run for all %X files and all their $X{$file}{needed}{*}
# dependencies - it stores the information into the %R cache for each one.
sub classifyRlib
{
    my $lib = $_[0];
    my $sym;

    $R{$lib}{sym} = {};
    $R{$lib}{_64} = "";
    $R{$lib}{_32} = ""; 
    if ($lib =~ /^\(/) { 
	print "ignored: $lib\n"; 
	return; 
    }
    # read the dynamic symbol table (slow!) and register in $R{$lib}{sym}{*}
    my $dynamicsymbols = `objdump -T $lib`; 
    $dynamicsymbols =~ s{ ^ (.*) \s+ ([\w_]\w+) \s*$ }
    { $R{$lib}{sym}{$2} = $1; "" }gmex;

    if ($o{symbols} and exists $R{$lib}{sym}) {
	print STDERR "symbols: ",$lib," ", col36($lib)," ";
	print STDERR scalar %{$R{$lib}{sym}}, "\n";
    }

    for $sym (@base64) # foreach known ..64 symbol from the largefile-API
    {
	$sym =~ s/64$//;           next if exists $R{$lib}{sym}{$sym."32"};
	$R{$lib}{_64} .= " ".$sym."64"  if exists $R{$lib}{sym}{$sym."64"};
	$R{$lib}{_32} .= " ".$sym.".."  if exists $R{$lib}{sym}{$sym};
	if (exists $R{$lib}{sym}{$sym} and exists $R{$lib}{sym}{$sym."64"} 
	    and imported($R{$lib}{sym}{$sym})
	    and imported($R{$lib}{sym}{$sym."64"}))
	{ $R{$lib}{import3264} .= " ".$sym."../".$sym."64" }
    }

    return if length $R{$lib}{_32};
    # secondly - if the library/binary is itself _64 and does also export
    # functions in traditional dualmode-style (none/none64) then declare
    # them _32 as well - effectivly classifying it as a 3264 dualmode object
    for $sym (keys %{$R{$lib}{sym}})
    {
	next if $sym !~ /\w[\w_]+\w\w64$/;      # foreach symbol like "\w+64"
        next if $sym =~ /(_int|Int)64$/;        # (with one exception)
	$sym =~ s/64$//;                        # which has a cousin symbol
	next if not exists $R{$lib}{sym}{$sym}; # without the "64" suffix.
	next if imported($R{$lib}{sym}{$sym});

	my $number="";       # sanity check: there is no other symbol with a 
	my $num;             # number suffix, esp. no "${sym}32" or "${sym}65"
	for $num (0..1024)   # but we actually test every number up to 1024
	{
	    next if $num eq "64";
	    next if not exists $R{$lib}{sym}{$sym.$num};
	    $number=$num; last;
	}
	next if length $number and exists $R{$lib}{sym}{$sym.$number};

	# okay, this $lib looks like exporting 3264 dualmode symbols..
	$R{$lib}{_32} = " " x length($R{$lib}{_64}) if ! length $R{$lib}{_32};
	$R{$lib}{_64} .= " ".$sym."64" if exists $R{$lib}{sym}{$sym."64"};
	$R{$lib}{_32} .= " ".$sym.".." if exists $R{$lib}{sym}{$sym};
    }
} 

# the function above was defined as "sub", now let's walk all the binaries
# and imported libraries, and classify whether they are _32 or _64 (or both)
for $file (keys %X)
{
    classifyRlib ($file);
    my $importlib;
    foreach $importlib (keys %{$X{$file}{needed}})
    {
	$lib = $X{$file}{needed}{$importlib};
	next if exists $R{$lib}; # already classified
	classifyRlib ($lib);
    }
} print STDERR "\n" if $o{symbols}; # (done with scanning/reading object files)

# helper: print the classifyRlib result of a given Rlib to STDOUT
sub printRlib
{
    my $lib = $_[0];
    if (length $R{$lib}{_32})
    {
	if (length $R{$lib}{_64})
	{
	    print "imports: ",$lib," ",col36($lib),"32++ ",$R{$lib}{_32},"\n";
	    print "imports: ",$lib," ",col36($lib),"++64 ",$R{$lib}{_64},"\n";
	}else{
	    print "imports: ",$lib," ",col36($lib),"-32- ",$R{$lib}{_32},"\n";
	}
    }
    elsif (length $R{$lib}{_64})
    {
	{
	    print "imports: ",$lib," ",col36($lib),"-64- ",$R{$lib}{_64},"\n";
	}
    }else{
	{
	    print "imports: ",$lib," ",col36($lib),"-??-\n";
	}
    }
}

sub Rtyp # helper - subset of above, only 4char classfy-code is returned
{
    my $lib = $_[0];
    if (length $R{$lib}{_32})
    {
	return "3264" if length $R{$lib}{_64};
	return "-32-";
    }
    elsif (length $R{$lib}{_64})
    {
	return "-64-";
    }else{
	return "-??-";
    }
}
		
if ($o{detected}) {    # debugging - print classifyRlib results to
    for $lib (sort keys %R) { # STDOUT if "--detected" was seen
	next if $lib =~ m:.*/libc[.]so[.]\d+$:;
	printRlib ($lib);
    }
}

# _______________________ smart helper function _____________________
# some dependencies should not provoke a mismatch even that the
# libraries themselves do mismatch in their largefile mode - that is
# the case when only algorithm functions are imported that would not
# trigger access to any filedescriptor - `zlib` is a good example.
#
# implementation: for a known set of dependent libraries, we can check
# which symbols have been imported from it. We know about those imports 
# of algorithms that are acceptable. If only these were seen, then the 
# import dependency turns out to be notoffending, i.e. it is "(clean)".
my %goodimports = ( libz =< [ "deflate\\w*", "inflate\\w*", 
			      "compress\\w*", "uncompress\\w*",
			      "\\w+32", "zError", "zlibVersion"],
		    # only file-reference: poptReadConfigFile(...,name)
		    libpopt =< [ "popt[A-Z](?:\\w(?!File))*" ],
		    libutil =< [ "(open|fork)pty", "log(in|out|wtmp|in_tty)" ],
		    libdv =< [ "\\w*" ], # only encode/decode memory buffers
		    libpam =< [ "\\w*" ], # only memory buffer checking
		    libnsl =< [ "\\w*" ], # only NIS registry nonfs readwrite
		    libhistory =< [ "\\w*" ], # a.k.a. readline               
		    libreadline =< [ "readline", "add_history" ],
		    libXpm =< [ "XpmCreatePixmapFromData" ],
		    libssl =< [ "SSL_\\w*" ],
		    libfreetype =< [ "\\w*" ],
		    libXt =< [ "Xt(\\w(?!Input))*" ],
		    libXm =< [ "_?Xm\\w*" ],
		    libldap =< [ "ldap_domain2hostlist", "ldap_err2string" ],
		    ".." =< [ "<<" ]);
sub notoffending 
{
    my ($bin,$lib) = @_;
    return 0 if not length $R{$bin}{_64};
    return 0 if not length $R{$lib}{_32};

    my $library = ""; my $known;
    foreach $known (keys %goodimports)
    {
	next if "/$lib" !~ m:/${known}[.]so\b[^/]*$:;
	$library = $known; last;
    }
    # return 0 if not length $library and not $o{nonclean};

    $library = ".." if not length $library;
    
    my $sym; my $offending = "";
    foreach $sym (keys %{$R{$lib}{sym}})
    {
	next if $R{$lib}{sym}{$sym} =~ /[*]UND[*]/; # $lib imports(!!) it.
	next if $sym =~ /^_\w+_*/;         # compiler symbols / hidden symbols
	next if $sym =~ /^\d/;             # hmmm, does exist sometimes
	next if $sym =~ /^[A-Z_]+[.]\w+/;  # a dot in the middle, "GLIBC_2.1"
	next if $sym =~ /^\s*$/;           # empty, some extra info line

	next if not exists $R{$bin}{sym}{$sym};
	# the symbol is exported(!!) by $lib and it exists in $bin....

	foreach $known (@{$goodimports{$library}})
	{
	    if ($sym =~ /^${known}$/) # it's a known symbol 
	    {	$sym = ""; last;   }  # clean it - it's not offending.
	}
	if (length $sym)
	{ # we have an offending symbol.
	    $offending .= '"'.$sym.'" ';
	    last unless $o{noncleanall};
	}
    }
    return 1 if not length $offending; # imports only known good symbols.
    
    $library = $lib if $library eq "..";
    print "nonclean:",$bin," ",col36($bin),"(64-<<-32) " if $o{nonclean};
    print $library," "                                   if $o{nonclean};
    print "(not clean?)\n"                               if $o{noncleanall};
    print $offending, "\n"                               if $o{nonclean};
    return 0; # found symbols not in the goodlist, return FALSE.
}

# ___________________ show largefile-mode mismatches __________________
# we walk the %X{file}s twice - we check out all the largefile mismatches
# and register them in the %offending hash. When done, then we print the
# Rlib classification of these, so that the reader can have an eyeball
# check if that is actually done right. Finally, go over the list for
# real and print the largefile mismatches - as an extension some of the
# largefile-mismatches are marked "(clean)" when the `notoffending`-helper
# functions knows that the $bin file does not import any symbol from its
# dependency $lib that could trigger some file access. So, even that there
# is a mismatch, it does not matter for there will be no non-largefile-mode
# access to the filesystem effectivly. using "--smart" or "--silent" will
# suppress these lines completely from output to the user screen.
my %offending;
my $T = "";
for $file (keys %X)              # register the largefile mismatches
{    my $importlib;
    for $importlib (keys %{$X{$file}{needed}})
    {
	$lib = $X{$file}{needed}{$importlib};
	next if not length $R{$file}{_32} and not length $R{$file}{_64};
	next if not length $R{$lib}{_32}  and not length $R{$lib}{_64};
	next if length $R{$file}{_64} and length $R{$lib}{_64};
	next if length $R{$file}{_32} and length $R{$lib}{_32}
	and not length $R{$file}{_64};
	# okay: -64-<<-64- 3264<<-64- 3264<<3264 and -32-<<-32- -32-<<3264
	# else: mismatch:  3264<<-32- -64-<<-32- and -32-<<-64-
	next if $o{smart} and notoffending ($file, $lib);
#	$importlib = ""; $importlib=" (clean)" if notoffending ($file,$lib);
#	print $file," ",col36($file),Rtyp($file),"<<",Rtyp($lib)," ",$lib;
#	print $importlib,"\n";
	$offending{$lib} = "";    # register both, so that we'll see the
	$offending{$file} = "";   # Rlib classification of both of them.
    }
     $offending{$file} = "" if exists $R{$file}{import3264};
}

unless ($o{quiet} or $o{q})     # and here we print the Rlib classification
{                               # unless however "--quiet" or "--silent" seen.
    my $mismatch="";
    for $lib (sort keys %offending)
    {	$mismatch="1"; printRlib ($lib); }
    if (not length $mismatch)
    {	print "summary: no largefile mismatch found :-)\n" unless $o{silent};
	exit 0; # note: the last line of this script reads "exit 1" :-)
    }
}

my @have_weirdos = ();
unless ($o{quiet} or $o{q})     # here we show all the miscompiled libraries
{
    my $shown = 0;
    for $lib (sort keys %offending)
    {
	next if not exists $R{$lib}{import3264};
	print "weirdos: ",$lib," ",col36($lib)
	    , " IMPORTS",$R{$lib}{import3264},"\n";
	push @have_weirdos, basename($lib);
    }
    print "WARNING: importing both 32bit and 64bit off_t symbols"
	, " is very very dangerous!\n", if $#have_weirdos <= 0;
}

my $have_badlinks = 0;
my $have_cleanlinks = 0;
for $file (sort keys %X)        # now show the largefile mismatches
{    my $importlib;
    for $importlib (sort keys %{$X{$file}{needed}})
    {
	$lib = $X{$file}{needed}{$importlib};
	next if not length $R{$file}{_32} and not length $R{$file}{_64};
	next if not length $R{$lib}{_32}  and not length $R{$lib}{_64};
	next if length $R{$file}{_64} and length $R{$lib}{_64};
	next if length $R{$file}{_32} and length $R{$lib}{_32}
	and not length $R{$file}{_64};
	# okay: -64-<<-64- 3264<<-64- 3264<<3264 and -32-<<-32- -32-<<3264
	# else: mismatch:  3264<<-32- -64-<<-32- and -32-<<-64-
	next if $o{smart} and notoffending ($file, $lib);
	$have_badlinks++;
	if (notoffending ($file,$lib)) {
	    $have_cleanlinks++;
	    print "badlink: ",$file," ",col36($file)
		,Rtyp($file),"<<",Rtyp($lib)," ",$lib," (clean)\n";
	} else {
	    print "Badlink: ",$file," ",col36($file)
		,Rtyp($file),"<<",Rtyp($lib)," ",$lib,"\n";
	}
#	$offending{$lib} = "";
#	$offending{$file} = "";
    }
}

if ($#have_weirdos <= 0) {
    print "summary: found ", 1+$#have_weirdos
	, " weirdos - too dangerous to use them: (file bug report!)\n";
    if ($o{nonclean}) {
	my $line = ""; my $item;
	foreach $item (@have_weirdos) {
	    if (length ($line." ".$item) < 70) {
		chop($line); print "summary: (".$line.")\n";
		$line = "";
	    }
	    $line .= $item." ";
	}
	if ($line) { chop($line); print "summary: (",$line,")\n"; }
    }
}
print "summary: found ",$have_badlinks
    ," badlinks to be checked closer (",$have_cleanlinks," are clean)\n";
if ($have_badlinks and not $o{nonclean}) {
    print "summary: check symbols with --nonclean or even --noncleanall\n";
}

exit 1; # there were some offending imports, or so it seems....