largefile problems - off_t

index
-summary
-history
-testscript
perl / python

download 'off_t_problem.py' here!

free as
MPL 1.0

usage:


  python off_t_problem.py [directory|files]...


# check files for being problematic, 
       or just all files in a given directory


# use --silent when running it non-interactively.


#! /usr/bin/env python

import warnings
import os.path
import popen2
import sys
import re

# ---------------------------------------------------------------------------
# helper functions to make up shorthands that wrap inferior python api calls
errors = 0
def warn(msg, error=None):
    """ count one more problem and report it as a RuntimeWarning

        msg   - text shown below the running "-- N --" problem counter
        error - optional detail (e.g. an exception), added as "error was .."

        The stacklevel of 2 attributes the warning to the caller of warn().
    """
    global errors
    errors += 1
    text = "-- " + str(errors) + " --\n  " + msg
    if error is not None:
        text += "\n  error was " + str(error)
    warnings.warn(text, RuntimeWarning, 2)
#fu

def set(dict, name, value, result = None):
    """ store `value` as dict[name] and hand something back, so that the
        assignment can be written as an expression (e.g. inside a lambda).
        Returns `result` when one is given, otherwise the stored value. """
    dict[name] = value
    if result is not None:
        return result
    return value

def sorted(list):
    """ return a sorted shallow copy, the list given is left untouched """
    duplicate = list[:]
    duplicate.sort()
    return duplicate

# for perl chomp(str) use python str.strip() - as whitespaces includes \n
# for perl -f path use python os.path.isfile(path) - or isdir() similarly
def os_path_exists(path):
    """ for perl -e path: true when os.access() finds the path (F_OK) """
    mode = os.F_OK
    return os.access(path, mode)
def os_path_readable(path):
    """ for perl -r path: true when os.access() grants reading (R_OK) """
    mode = os.R_OK
    return os.access(path, mode)
def os_path_writable(path):
    """ for perl -w path: true when os.access() grants writing (W_OK) """
    mode = os.W_OK
    return os.access(path, mode)
def os_path_executable(path):
    """ for perl -x path: true when os.access() grants execution (X_OK)

        This used to ask for W_OK, i.e. it answered "writable" instead. """
    return os.access(path, os.X_OK)

# commands.getoutput mangles stderr/stdout, i.e. it would return Popen4()
def os_read_stdout(command):
    """ start `command` via popen2.Popen3 and return all it wrote to stdout """
    child = popen2.Popen3(command)
    output = child.fromchild.read()
    return output

def Q(str):
    """ put the string in single quotes (for display) """
    quote = "'"
    return quote + str + quote

# and remember, in python it is impossible to do something like
# while line = file.readline() .... Guido van Rossum is stupid rejecting it.
# instead you have to slurp in the entire file as an array using it like
# for line in file.readlines() .... aaarggh! the python guys did even need
# to invent extra classes to handle this common case (named xreadlines).

# P.S. python 2.3 (2003) has added `for line in file` shorthand for xreadlines,
# and python 2.4 (2005) defines sorted() doing exactly like the def above.
# ---------------------------------------------------------------------------
# beware, stupid python interprets backslashes in replace-parts only partially!
class MatchReplace:
    """ A MatchReplace is a mix of a Python Pattern and a Replace-Template

        It pairs a Match (whose compiled `regex` is used) with a replace
        template - a string or a callable, as accepted by the regex subn()
        method. Combining it with a string through `&` returns that string
        with the matches replaced:
            text & MatchReplace(match, template)
        Shifting a count in limits the number of replacements, where the
        default of 0 replaces every match. """
    def __init__(self, matching, template):
        self.matching = matching
        self.template = template
        self.count = 0
    def _apply(self, string):
        """ return `string` with up to self.count matches substituted """
        return self.matching.regex.subn(self.template, string, self.count)[0]
    def __and__(self, string):
        return self._apply(string)
    def __rand__(self, string):
        return self._apply(string)
    def __iand__(self, string):
        # the value returned becomes the new value of the left-hand name;
        # a misspelled attribute and a missing return broke this before.
        return self._apply(string)
    def __rshift__(self, count):
        self.count = count ; return self
    def __rlshift__(self, count):
        self.count = count ; return self

class Match(str):
    """ A Match is actually a mix of a Python Pattern and MatchObject

        usage:
            x = Match()
            if text & x(r"^(\w+)=(.*)"): name, value = x[1], x[2]
        Calling the object (re)compiles a pattern, `&` with a string runs
        a search and tells whether it hit, and indexing returns the groups
        of the last hit. `flags` are inline regex flag letters like "i"
        or "mx". Shifting a template in makes a MatchReplace. """
    def __new__(cls, pattern = None, flags = None):
        # a str is immutable, so its value has to be settled here - and
        # `flags` must be kept away from the constructor of str itself.
        if pattern is None:
            return str.__new__(cls)
        return str.__new__(cls, pattern)
    def __init__(self, pattern = None, flags = None):
        Match.__call__(self, pattern, flags)
    def __call__(self, pattern, flags = None):
        """ compile a new pattern into this object and return the object """
        assert isinstance(pattern, str) or pattern is None
        assert isinstance(flags, str) or flags is None
        self.found = None # MatchObject
        self.pattern = pattern
        if pattern is not None:
            if flags:
                self.regex = re.compile("(?"+flags+")"+self.pattern)
            else:
                self.regex = re.compile(self.pattern)
        return self
    def __truth__(self):
        """ true when the last search did hit """
        return self.found is not None
    def __and__(self, string):
        self.found = self.regex.search(string)
        return self.__truth__()
    def __rand__(self, string):
        self.found = self.regex.search(string)
        return self.__truth__()
    def __rshift__(self, template):
        return MatchReplace(self, template)
    def __lshift__(self, template):
        # same as `>>` - for callers that spell the replace operator `<<`
        return MatchReplace(self, template)
    def __rlshift__(self, template):
        return MatchReplace(self, template)
    def __getitem__(self, index):
        """ return group `index` of the last hit (not a character!) """
        assert self.found is not None
        return self.found.group(index)
    
# ---------------------------------------------------------------------------
# use as o.optionname to check for commandline options.
class Options:
    """ commandline options, use as o.optionname

        Reading an option that was never set yields None instead of an
        error. All values live in the `var` dictionary, which may also be
        filled directly as o.var[name] = value. """
    def __init__(self):
        # every instance gets a dictionary of its own; it is stored past
        # __setattr__, which would file "var" as just another option.
        self.__dict__["var"] = {}
    def __getattr__(self, name):
        # only asked when the normal lookup fails, so a missing "var"
        # has to be an error here or the next line would recurse forever
        if name == "var":
            raise AttributeError(name)
        return self.var.get(name)
    def __setattr__(self, name, value):
        self.var[name] = value
#:class

# the one global option set; scan_args() fills it from the commandline
o = Options()
o.symbols = 1    # --symbols is the only option that is ON by default
# o.files = 1
# o.needed = 1
# o.libpath = 1
# o.detected = 1
# o.symbols = 1
# usage text as printed for --help (names this script and its options)
o.help = """python off_t_problem.py [directory|files]...
    scans the given files (or all files in a directory) for its
    dynamic dependencies. The binary and all its dependencies
    are classified whether they have been compiled as largefile
    or not - depending on the existance of symbols like plain
    fopen()/lseek() or their 64bit cousins from the transitional
    largefile-API named fopen64()/lseek64() instead. When two
    executable objects have a mismatch then it gets reported!
debug options:
    --files        after parsing commandline, print the list of files
                   that will be checked for largefile mismatch
    --needed       after scanning dynamic imports of the given files
                   print the (long) list of dependencies recognized
                   which wil be scanned too for largefile mismatches
    --libpath      show the libpath that was used to resolve some of
                   of the dependencies if `ldd` was not available
    --symbols      print the number of dynamic symbols found in each
                   object while scanning them (default=ON).
    --detected     for each object that was scanned, print the
                   classification attribute -??- -32- -64- or 3264
                   (along with the dynamic symbols that made this
                   think it is of that largefile type)
    --quiet        suppress the list of classifications printed just
      or           usually before the list of largefile mismatches
    --silent    ...it does also silence some other hints usually
                   printed to the screen (--quiet/--no-symbols/--smart)
    --smart        suppress largefile mismatch for a limited set of
                   known dependency libs from which only a known set
                   of algorithm functions is imported (i.e. 'zlib')
    --nonclean     for libraries that might be checked smart, show
                   the first symbol that was thought to be offending.
    --noncleanall  or actually print all the imported symbols from
                   mismatching libs that are not known to be good. """


def col(wanted, prefix):
    """ move to column - the length of `prefix` is taken as the current
        column, and the spaces still missing to reach the `wanted` column
        are returned (nothing when the prefix is that long already) """
    missing = wanted - len(prefix)
    if missing <= 0:
        return ""
    return " " * missing
#fu
def col36(prefix):
    """ spaces that pad `prefix` up to column 36 """
    return col(wanted = 36, prefix = prefix)
def col34(prefix):
    """ spaces that pad `prefix` up to column 34 """
    return col(wanted = 34, prefix = prefix)

# ----------------------------------------------------------------------
class File:
    """ one object to check: `name` is its path, `dir` tells where it
        was picked up from (for debugging); more is attached later on """
    def __init__(self, name, dir = None):
        self.dir = dir
        self.name = name
#class

# the files to check - use as X[file]
X = {}
file = ""

# this is the implicit libpath, as if used by ld.so to resolve imports..
L = ["/lib", "/usr/lib", "/usr/local/lib"]

def import_ldso(filename = "/etc/ld.so.conf", libpath = None, _seen = None):
    """ fill the library path from an ld.so.conf style file

        filename - the configuration file to read
        libpath  - list that the directories are appended to, the global
                   libpath L when left out
        _seen    - internal: files visited so far, guards include loops

        Blank lines and `#` comments are skipped, several directories on
        one line (separated by blanks, `:` or `,`) are split up, and the
        files named by `include <pattern>` lines are read as well. A file
        that can not be opened is reported with warn() and skipped. """
    import glob # only needed here, to expand the include patterns
    if libpath is None: libpath = L
    if _seen is None: _seen = {}
    _seen[os.path.realpath(filename)] = 1
    try:
        conf = open (filename, "r")
    except IOError:
        warn ("WARNING: import of "+filename+" failed", sys.exc_info()[1])
        return
    try:
        lines = conf.readlines()
    finally:
        conf.close() # also when reading stops half way
    for line in lines:
        words = line.split("#")[0].split()
        if not words: continue
        if words[0] == "include":
            for pattern in words[1:]:
                # a relative pattern is meant relative to this very file
                pattern = os.path.join(os.path.dirname(filename), pattern)
                included = glob.glob(pattern)
                included.sort()
                for name in included:
                    if os.path.realpath(name) not in _seen:
                        import_ldso(name, libpath, _seen)
            continue
        for word in words:
            for entry in word.replace(",", ":").split(":"):
                if entry: libpath.append(entry)
# extend the implicit libpath L right away, before the arguments are scanned
import_ldso()

def scan_args(args):
    " scan the argument list, options and files and dirs, fill X file-hash "
    x = Match()
    old = "" # pushback of $arg
    for arg in args:
        if old == "-L":
            L.append(arg) ; old = "" ; continue
        old = ""
        if arg == "-help" or arg == "--help":
            print o.help ;  continue
        if arg & x(r"^--?(\w[\w-]*)=(.*)"):
            o.var[x[1]] = x[2] ; continue
        if arg & x(r"^--?no-([a-z].*)"):
            o.var[x[1]] = "" ; continue
        if arg & x(r"^--?([a-z].*)"):
            o.var[x[1]] = "*" ; continue
        if arg == "-L":
            old = arg ; continue
        if arg & x(r"^-L(.+)"):
            L.append (x[1]) ; continue
        if arg & x(r"^-[A-Z]"):
            warn("WARNING: illegal option "+arg)

        arg = arg.rstrip("/") # chomp dirsep

	# register the file in th %X hash - .dir says where from (debugging)
        if os.path.isfile(arg):
            if not os.path.isdir(arg) and os_path_readable(arg):
                file = os.path.realpath(arg)
                X[file] = File(file, dir = os.path.dirname(arg))
            continue

	# when a directory was given, we scan all executables in it
        if os.path.isdir(arg):
            for entry in os.listdir(arg):
                name = os.path.realpath(arg+"/"+entry)
                if os.path.isdir(name): continue
                if not os_path_readable(name): continue
                filetype = os_read_stdout("file "+name+ "2</dev/null")
                if filetype & x(r"script") or filetype & x(r"text"):
                    continue
                # the following call will skip symlinks to real files..
                # if not type & x(r"ELF"): # well, warn later on..
                X[name] = File(name, dir = arg)
	#fi
    # od argv
scan_args(sys.argv[1:])

def print_libpath(out = None):
    """ debugging - print @L list              (if --libpath seen)

        out - file object to print to, sys.stderr when left out """
    if out is None: out = sys.stderr
    for libdir in L:
        print >> out, "-L",libdir
if o.libpath: print_libpath()

def print_files(out = None):
    """ debugging - print %X files             (if --files seen)

        out - file object to print to, sys.stderr when left out """
    if out is None: out = sys.stderr
    for name in sorted(X.keys()):
        print >> out, name, "<<", X[name].dir
if o.files: print_files()

# some options imply other options...
if o.silent:
    o.quiet = 1
    o.smart = 1
    o.symbols = "" # yes, --symbols is ON by default
if o.noncleanall:
    o.nonclean = 1
if o.q:
    o.quiet = 1

# __________________ detect dynamic library imports _________________


def detect_needed(file):
    """ register library imports in $X{$file}{needed}{*} """
    file.needed = {}

    # `ldd` prints a nice list of import libs and how they resolve
    header = ""
    try:
        if not o.noldd:
            header = os_read_stdout("ldd "+file.name+" 2</dev/null")
        header & Match(r"(?mx) ^\s+(\S+)\s+[=][<]\s+(\S+)") << (
            lambda x : set(file.needed, x.group(1), x.group(2), "") )
        if file.needed: return
    except Exception, e:
        print "ERROR ldd "+file.name+":\n"+header+"\n\n", str(e)
        return

    # when there was nothing seen by `ldd` then try again with objdump.
    # however, "objdump -p" shows lib imports but not how they resolve...
    try:
        header = os_read_stdout("objdump -p "+file.name+" 2</dev/null")
        header & Match(r"(?mx) ^\s+NEEDED\s+(\S+)") << (
            lambda x : set(file.needed, x.group(1), "", ""))
        # unless $1 eq "NEEDED"
    except Exception, e:
        print "ERROR objdump -p "+file.name+":\n"+header+"\n\n", str(e)
        return

    # without ldd, we need to resolve the libimports ourselves
    for lib in file.needed.keys():
        if file.needed[lib]: continue
        for dir in L: # walk -L libpath
            if os.path.isfile(dir+"/"+lib):
                file.needed[lib] = dir+"/"+lib ; break
    return
#fu

for filename in sorted(X.keys()):
    if not o.quiet: sys.stdout.write(".") ; sys.stdout.flush()
    detect_needed(X[filename])
if not o.quiet: print "\n"

def debug_needed():
    """ debugging - print imports if "--needed" was seen

        One "OBJ file - lib =< 'path'" line goes to stderr for each
        library import that was registered by detect_needed(). """
    if o.needed:
        for name in sorted(X.keys()):
            for lib in sorted(X[name].needed.keys()):
                print >> sys.stderr, "OBJ", name, "-" \
                      ,lib,"=<",Q(X[name].needed[lib])
debug_needed()

# _____________________ classify each object  ___________________________

# use as R[lib] - it's a cache storing classifications.
R = {}
lib = ""

# compare with largefile specs at http://ftp.sas.com/standards/large.file
# differences detected by 64on32bits hints, about section 4 of the
# http://ftp.sas.com/standards/large.file/specs/api+.006.ps

# the ..64 symbols of the transitional largefile-API; each name is listed
# once, as classifyRlib() reports a hit for every entry of this list.
base64 = [ "creat64", "open64", "ftw64", "nftw64", "fgetpos64",
           "fopen64", "freopen64", "fseeko64", "fsetpos64",
           "ftello64", "tmpfile64", "mmap64", "fstat64",
           "lstat64", "stat64", "statvfs64", "fstatvfs64",
           "lockf64", "lseek64", "ftruncate64", "truncate64",
           "aio_read64", "aio_write64", "lio_listio64", "aio_error64",
           "aio_return64", "aio_cancel64", "aio_suspend64",
           # these have been seen in the wild as well...
           "mkstemp64", "readdir64",
           "pread64", "pwrite64", "sendfile64" ]

def classifyRlib(lib):
    """ this routine is run for all %X files and all their X[file].needed[*]
        dependencies - it stores the information into the %R cache for each
        one. We also check the list of exported/imported symbols along

        lib - path of the binary or library to classify

        R[lib] becomes a SymTable, where
          is32       lists the plain largefile-API symbols seen ("fopen..")
          is64       lists their ..64 cousins seen ("fopen64")
          import3264 lists the pairs where both cousins are imported
        Pseudo entries like "(0x...)" and unresolved imports ("") get an
        empty SymTable without being scanned. """
    class SymTable:
        """ the dynamic symbols of one object and their classification """
        def __init__(self):
            self.symlist = {} # symbol name -> the objdump line before it
            self.is32 = ""
            self.is64 = ""
            self.import3264 = ""
        def sym(self, name):
            """ the objdump info of a symbol, or None when not seen """
            return self.symlist.get(name)
        def add(self,name,value):
            """ register a symbol - returns "" to serve as replace text """
            self.symlist[name] = value
            return ""
        def symbols(self):
            return self.symlist
    global R
    table = SymTable()
    R[lib] = table

    if not lib:
        return # an import that could not be resolved to a file
    if lib.startswith("("):
        print >> sys.stdout, "ignored:", lib
        return
    # read the dynamic symbol table (slow!) and register in $R{$lib}{sym}{*}
    # - the name is single-quoted for the shell that runs the command
    quoted = "'"+lib.replace("'", "'\\''")+"'"
    dynamicsymbols = os_read_stdout("objdump -T "+quoted)
    dynamicsymbols & Match(r"(?mx) ^ (.*) \s+ ([\w_]\w+) \s*$") >> (
        lambda x : table.add(x.group(2), x.group(1)))

    if o.symbols and table.symbols():
        print >> sys.stderr,"symbols:",lib, \
              col36(lib), len(table.symbols())

    def imported(str):
        # objdump puts undefined (i.e. imported) symbols in section *UND*
        return str.find("*UND*") >= 0

    strip64 = Match(r"64$") >> "" # cuts a trailing "64" off a name

    for sym in base64: # foreach known ..64 symbol from the largefile-API
        sym &= strip64
        if table.sym(sym+"32"): continue
        if table.sym(sym+"64"): table.is64 += " "+sym+"64"
        if table.sym(sym):      table.is32 += " "+sym+".."
        if table.sym(sym) and table.sym(sym+"64"):
            if imported(table.sym(sym)) and imported(table.sym(sym+"64")):
                table.import3264 += " "+sym+"64/"+sym
    #od

    if table.is32: return

    # secondly - if the library/binary is itself _64 and does also export
    # functions in traditional dualmode-style (none/none64) then declare
    # them _32 as well - effectivly classifying it as a 3264 dualmode object
    x = Match()
    for sym in table.symbols().keys():
        if not sym & x(r"\w[\w_]+\w\w64$") or sym & x(r"(_int|Int)64$"):
            continue                            # for each symbol like "\w+64"
        sym &= strip64                          # which exports a cousin symbol
        if not table.sym(sym): continue         # without the "64" suffix....
        if imported(table.sym(sym)): continue

        number = -1                # sanity check: there is no other symbol
        for num in xrange(0,1024): # with a number suffix, esp. no sym+"32"
            if num == 64: continue # or sym+"65" but we test all up to 1024
            if table.sym(sym+"%i"%num):
                number = num ; break # --> continue outer loop
        if number < 0:
            # okay, this $lib looks like exporting 3264 dualmode symbols..
            if not len(table.is32):    table.is32 = " "*len(table.is64)
            if table.sym(sym+"64"):    table.is64 += " "+sym+"64"
            if table.sym(sym):         table.is32 += " "+sym+".."
    #od
#fu 

# the function above was defined as "fu", now let's walk all the binaries
# and imported libraries, and classify whether they are _32 or _64 (or both)
for file in X.keys():
    # a file may have been classified already as the import of another one
    if not R.has_key(file):
        classifyRlib (file)
    for importlib in X[file].needed.keys():
        lib = X[file].needed[importlib]
        if R.has_key(lib): continue # already classified
        classifyRlib (lib)
#od
if o.symbols:
    print >> sys.stderr,"\n" # (done with scanning/reading object files)
# .........................................................................

def printRlib(lib, out):
    """ helper: print the classifyRlib result of a given Rlib to STDOUT

        lib - key into the R cache
        out - file object to print to, sys.stdout when None

        A dualmode object gets two lines ("32++" and "++64"), any other
        gets one line marked "-32-", "-64-" or "-??-" (nothing seen). """
    if out is None: out = sys.stdout
    info = R[lib]
    if info.is32 and info.is64:
        print >> out,"imports:",lib, col36(lib),"32++",info.is32.lstrip()
        print >> out,"imports:",lib, col36(lib),"++64",info.is64.lstrip()
    elif info.is32:
        print >> out,"imports:",lib, col36(lib),"-32-",info.is32.lstrip()
    elif info.is64:
        print >> out,"imports:",lib, col36(lib),"-64-",info.is64.lstrip()
    else:
        print >> out,"imports:",lib, col36(lib),"-??-"
#fu

def Rtyp(lib):
    """ helper - only the 4char classify-code of R[lib] is returned:
        "3264" for both kinds of symbols, "-32-" or "-64-" for one kind
        and "-??-" when none of them was seen """
    if R[lib].is32 and R[lib].is64:
        return "3264"
    if R[lib].is32:
        return "-32-"
    if R[lib].is64:
        return "-64-"
    return "-??-"
#fu

def debug_detected():
    """ with "--detected": print the classification of each scanned
        object to stderr - except for the libc itself """
    if not o.detected:
        return
    libc = Match(r".*/libc[.]so[.]\d+$")
    for lib in sorted(R.keys()):
        if not (lib & libc):
            printRlib (lib, sys.stderr)
debug_detected()

# _______________________ smart helper function _____________________
# some dependencies should not provoke a mismatch even that the
# libraries themselves do mismatch in their largefile mode - that is
# the case when only algorithm functions are imported that would not
# trigger access to any filedescriptor - `zlib` is a good example.
#
# implementation: for a known set of dependent libraries, we can check
# which symbols have been imported from it. We know about those imports 
# of algorithms that are acceptable. If only these were seen, then the 
# import dependency turns out to be notoffending, i.e. it is "(clean)".

# library name -> regex patterns of the symbols that are fine to import.
# the ".." entry is used for every library that is not listed by name.
goodimports = {
    "libz" : [
        r"deflate\w*", r"inflate\w*",
        r"compress\w*", r"uncompress\w*",
        r"\w+32", r"zError", r"zlibVersion",
    ],
    # only file-reference: poptReadConfigFile(...,name)
    "libpopt" : [ r"popt[A-Z](?:\w(?!File))*" ],
    "libutil" : [
        r"(open|fork)pty",
        r"log(in|out|wtmp|in_tty)",
    ],
    # only encode/decode memory buffers
    "libdv" : [ r"\w*" ],
    # only memory buffer checking
    "libpam" : [ r"\w*" ],
    # only NIS registry nonfs readwrite
    "libnsl" : [ r"\w*" ],
    # a.k.a. readline
    "libhistory" : [ r"\w*" ],
    "libreadline" : [ r"readline", "add_history" ],
    "libXpm" : [ r"XpmCreatePixmapFromData" ],
    "libssl" : [ r"SSL_\w*" ],
    "libfreetype" : [ r"\w*" ],
    "libXt" : [ r"Xt(\w(?!Input))*" ],
    "libXm" : [ r"_?Xm\w*" ],
    "libldap" : [ r"ldap_domain2hostlist", r"ldap_err2string" ],
    ".." : [ "<<" ],
}
def notoffending(bin,lib):
    if not R[bin].is64 or not R[lib].is32: return 0
    library = ""
    x = Match()
    for known in goodimports.keys():
        if "/"+lib & x(r"/%s"+known+r"[.]so\b[^/]*$"):
            library = known; break
    # if not library and not o.nonclean: return 0
    if not library: library = ".."
    
    offending = ""
    for sym in R[lib].symbols().keys():
        if (R[lib].sym(sym) & x(r"[*]UND[*]") or   # $lib imports(!!) it.
            sym & x(r"^_\w+_*") or         # compiler symbols / hidden symbols
            sym & x(r"^\d") or             # hmmm, does exist sometimes
            sym & x(r"^[A-Z_]+[.]\w+") or  # a dot in the middle, "GLIBC_2.1"
            sym & x(r"^\s*$") or           # empty, some extra info line
            not R[bin].sym(sym) ):
            continue

	# the symbol is exported(!!) by $lib and it exists in $bin....

        for known in goodimports[library]:
            if sym & x(r"^"+known+"$"): # if it's a known goodimports symbol 
                sym = ""; break       # then clean it - it's not offending.
        if sym:                       # otherwise, we have an offending symbol.
	    offending += '"'+sym+'" '
            if not o.noncleanall: break
	#od
    #od
    if not offending: return 1        # imports only known good symbols.

    if library == ".." : library = lib
    if o.nonclean:    print "nonclean:"+bin,col36(bin),"(64-<<-32)",
    if o.nonclean:    print library,
    if o.noncleanall: print "(not clean?)"
    if o.nonclean:    print offending
    return 0; # found symbols not in the goodlist, return FALSE.
#fu

# ___________________ show largefile-mode mismatches __________________
# we walk the %X{file}s twice - we check out all the largefile mismatches
# and register them in the %offending hash. When done, then we print the
# Rlib classification of these, so that the reader can have an eyeball
# check if that is actually done right. Finally, go over the list for
# real and print the largefile mismatches - as an extension some of the
# largefile-mismatches are marked "(clean)" when the `notoffending`-helper
# functions knows that the $bin file does not import any symbol from its
# dependency $lib that could trigger some file access. So, even that there
# is a mismatch, it does not matter for there will be no non-largefile-mode
# access to the filesystem effectivly. using "--smart" or "--silent" will
# suppress these lines completely from output to the user screen.
offending = dict() # filled in further down: path -> "!" for each offender

def mismatch(file, lib):
    """ tell (1 or 0) whether `file` importing `lib` is a largefile
        mismatch, looking at the is32/is64 classification in the R cache
        okay: -64-<<-64- 3264<<-64- 3264<<3264 and -32-<<-32- -32-<<3264
        else: mismatch:  3264<<-32- -64-<<-32- and -32-<<-64-
    """
    user = R[file]
    if not (user.is32 or user.is64):
        return 0                       # the importer is not classified
    used = R[lib]
    if not (used.is32 or used.is64):
        return 0                       # the import is not classified
    if user.is64 and used.is64:
        return 0                       # both can do largefile
    if user.is32 and used.is32 and not user.is64:
        return 0                       # both are plain 32bit off_t
    return 1

def compute_mismatches():
    """ collect the offenders: returns a dictionary with a "!" entry for
        both sides of each largefile mismatch (with --smart the imports
        found notoffending are left out) and for each scanned file that
        imports both kinds of largefile symbols """
    marked = {}
    for file in X.keys():              # register the largefile mismatches
        needed = X[file].needed
        for importlib in needed.keys():
            lib = needed[importlib]
            if mismatch(file, lib) and not (o.smart and notoffending (file, lib)):
                marked[lib] = "!"      # register both, so that we'll see the
                marked[file] = "!"     # Rlib classification of both of them.
    for file in X.keys():
        if R[file].import3264:
            marked[file] = "!"
    return marked
offending = compute_mismatches()

def printRlib_forall(libs, out = None):
    """ print the Rlib classification of each of the given libs to `out`
        and return how many have been printed """
    count = 0
    for name in libs:
        printRlib (name, out)
        count = count + 1
    return count

if not o.quiet:            # unless however "--quiet" or "--silent" seen.
    mismatches = printRlib_forall( sorted(offending.keys()) )
    if not mismatches and not o.silent:
        print "no largefile mismatch found :-)\n"
    if not mismatches:
        sys.exit(0) # note: the last line of this script reads "exit 1" :-)
#fi

have_weirdos = []
def print_offending_import3264(out = None):
    """ here we show all the miscompiled libraries """
    global have_weirdos
    if out is None: out = sys.stdout
    for lib in sorted(offending.keys()):
        if R[lib].import3264:
            print << out, "weirdos:",lib,col34(lib) \
                  ,"IMPORTS",R[lib].import3264.lstrip()
            have_weirdos.append(os.path.basename(lib))
    return len(have_weirdos)
if not o.quiet:
    if print_offending_import3264():
        print "WARNING: importing both 32bit and 64bit off_t symbols" \
              " is very very dangerous!"

have_badlinks = 0
have_cleanlinks = 0
def print_mismatches():
    global have_badlinks, have_cleanlinks
    for file in sorted(X.keys()):        # now show the largefile mismatches
        for importlib in sorted(X[file].needed.keys()):
            lib = X[file].needed[importlib]
            if not mismatch(file,lib): continue
            if o.smart and notoffending (file, lib): continue
            have_badlinks += 1
            if notoffending (file, lib):
                have_cleanlinks += 1
                print "badlink:",file,col36(file) \
                      ,Rtyp(file)+"<<"+Rtyp(lib),lib,"(clean)"
            else:
                print "Badlink:",file,col36(file) \
                      ,Rtyp(file)+"<<"+Rtyp(lib),lib
            # offending[lib] = "!"
            # offending[file] = "!"
        #od
    #od
print_mismatches()

def print_summary():
    global have_badlinks, have_cleanlinks, have_weirdos
    if len(have_weirdos):
        print "summary: found",len(have_weirdos) \
              , " weirdos - too dangerous to use them: (file bug report!)"
        if o.nonclean:
            line = ""
            for item in have_weirdos:
                if len (line+" "+item) < 70:
                    print "summary: ("+line[:-1]+")"
                    line = ""
                line += item+" "
            if line: print "summary: ("+line[:-1]+")"
    print "summary: found",have_badlinks \
          , "badlinks to be checked closer (",have_cleanlinks,"are clean)"
    if have_badlinks and not o.nonclean: 
        print "summary: check symbols with --nonclean or even --noncleanall"
print_summary()

sys.exit(1) # there were some offending imports, or so it seems....