From: Mark Hymers Date: Wed, 23 Mar 2011 18:34:24 +0000 (+0000) Subject: metadata generation work X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f6b62be0ac52b3bc05ec48ef2c458d2fd83625b6;p=dak metadata generation work Signed-off-by: Mark Hymers --- diff --git a/dak/dak.py b/dak/dak.py index 5a659d8c..ad99a5a0 100755 --- a/dak/dak.py +++ b/dak/dak.py @@ -86,6 +86,8 @@ def init(): "Generate Packages/Sources files"), ("contents", "Generate content files"), + ("metadata", + "Load data for packages/sources files"), ("generate-index-diffs", "Generate .diff/Index files"), ("clean-suites", diff --git a/dak/packagescan.py b/dak/metadata.py similarity index 65% rename from dak/packagescan.py rename to dak/metadata.py index 2d2bab00..f40c9431 100755 --- a/dak/packagescan.py +++ b/dak/metadata.py @@ -1,10 +1,6 @@ #!/usr/bin/env python """ -Import data for Packages files from .deb files - -@contact: Debian FTPMaster -@copyright: 2008, 2009 Michael Casadevall -@copyright: 2009 Mike O'Connor +Import data for Package/Sources files from .deb and .dsc files @copyright: 2011 Torsten Werner @copyright: 2011 Mark Hymers @license: GNU General Public License version 2 or later @@ -41,18 +37,21 @@ import apt_pkg from daklib.config import Config from daklib.dbconn import * -from daklib.packages import PackagesScanner +from daklib.metadata import MetadataScanner from daklib import daklog from daklib import utils ################################################################################ def usage (exit_code=0): - print """Usage: dak packagescan [options] subcommand + print """Usage: dak metadata [options] subcommand SUBCOMMANDS - scan - scan the debs in the existing pool and load metadata into the database + scan-source + scan the dsc files in the existing pool and load metadata into the database + + scan-binary + scan the deb files in the existing pool and load metadata into the database OPTIONS -h, --help @@ -60,17 +59,17 @@ OPTIONS OPTIONS for scan -l, 
--limit=NUMBER - maximum number of packages to scan + maximum number of items to scan """ sys.exit(exit_code) ################################################################################ -def scan_all(cnf, limit): - Logger = daklog.Logger(cnf.Cnf, 'packages scan') - result = PackagesScanner.scan_all(limit) - processed = '%(processed)d packages processed' % result - remaining = '%(remaining)d packages remaining' % result +def scan_all(cnf, mode, limit): + Logger = daklog.Logger(cnf.Cnf, 'metadata scan (%s)' % mode) + result = MetadataScanner.scan_all(mode, limit) + processed = '%(processed)d %(type)s processed' % result + remaining = '%(remaining)d %(type)s remaining' % result Logger.log([processed, remaining]) Logger.close() @@ -78,17 +77,17 @@ def scan_all(cnf, limit): def main(): cnf = Config() - cnf['Packages::Options::Help'] = '' - cnf['Packages::Options::Suite'] = '' - cnf['Packages::Options::Limit'] = '' - cnf['Packages::Options::Force'] = '' - arguments = [('h', "help", 'Packages::Options::Help'), - ('s', "suite", 'Packages::Options::Suite', "HasArg"), - ('l', "limit", 'Packages::Options::Limit', "HasArg"), - ('f', "force", 'Packages::Options::Force'), + cnf['Metadata::Options::Help'] = '' + cnf['Metadata::Options::Suite'] = '' + cnf['Metadata::Options::Limit'] = '' + cnf['Metadata::Options::Force'] = '' + arguments = [('h', "help", 'Metadata::Options::Help'), + ('s', "suite", 'Metadata::Options::Suite', "HasArg"), + ('l', "limit", 'Metadata::Options::Limit', "HasArg"), + ('f', "force", 'Metadata::Options::Force'), ] args = apt_pkg.ParseCommandLine(cnf.Cnf, arguments, sys.argv) - options = cnf.SubTree('Packages::Options') + options = cnf.SubTree('Metadata::Options') if (len(args) != 1) or options['Help']: usage() @@ -97,8 +96,11 @@ def main(): if len(options['Limit']) > 0: limit = int(options['Limit']) - if args[0] == 'scan': - scan_all(cnf, limit) + if args[0] == 'scan-source': + scan_all(cnf, 'source', limit) + return + elif args[0] == 'scan-binary': + 
scan_all(cnf, 'binary', limit) return suite_names = utils.split_args(options['Suite']) diff --git a/daklib/dbconn.py b/daklib/dbconn.py index 4d30e663..98b6c7d5 100755 --- a/daklib/dbconn.py +++ b/daklib/dbconn.py @@ -492,6 +492,10 @@ class DBBinary(ORMObject): self.poolfile = poolfile self.binarytype = binarytype + @property + def pkid(self): + return self.binary_id + def properties(self): return ['package', 'version', 'maintainer', 'source', 'architecture', \ 'poolfile', 'binarytype', 'fingerprint', 'install_date', \ @@ -533,20 +537,28 @@ class DBBinary(ORMObject): ''' Reads the control information from a binary. - @rtype: tuple - @return: (stanza, controldict) stanza is the text of the control - section. controldict is the information in a dictionary - form + @rtype: text + @return: stanza text of the control section. ''' - import apt_inst, apt_pk + import apt_inst fullpath = self.poolfile.fullpath deb_file = open(fullpath, 'r') - stanza = apt_inst.debExtractControl(deb_file).rstrip() - control = dict(apt_pkg.TagSection(stanza)) + stanza = apt_inst.debExtractControl(deb_file) deb_file.close() - return stanza, control + return stanza + + def read_control_fields(self): + ''' + Reads the control information from a binary and return + as a dictionary. + @rtype: dict + @return: fields of the control section as a dictionary. + ''' + import apt_pkg + stanza = self.read_control() + return apt_pkg.TagSection(stanza) __all__.append('DBBinary') @@ -2176,6 +2188,60 @@ __all__.append('get_sections') ################################################################################ +from debian.debfile import Deb822 + +# Temporary Deb822 subclass to fix bugs with : handling; see #597249 +class Dak822(Deb822): + def _internal_parser(self, sequence, fields=None): + # The key is non-whitespace, non-colon characters before any colon. 
+ key_part = r"^(?P<key>[^: \t\n\r\f\v]+)\s*:\s*" + single = re.compile(key_part + r"(?P<data>\S.*?)\s*$") + multi = re.compile(key_part + r"$") + multidata = re.compile(r"^\s(?P<data>.+?)\s*$") + + wanted_field = lambda f: fields is None or f in fields + + if isinstance(sequence, basestring): + sequence = sequence.splitlines() + + curkey = None + content = "" + for line in self.gpg_stripped_paragraph(sequence): + m = single.match(line) + if m: + if curkey: + self[curkey] = content + + if not wanted_field(m.group('key')): + curkey = None + continue + + curkey = m.group('key') + content = m.group('data') + continue + + m = multi.match(line) + if m: + if curkey: + self[curkey] = content + + if not wanted_field(m.group('key')): + curkey = None + continue + + curkey = m.group('key') + content = "" + continue + + m = multidata.match(line) + if m: + content += '\n' + line # XXX not m.group('data')? + continue + + if curkey: + self[curkey] = content + + class DBSource(ORMObject): def __init__(self, source = None, version = None, maintainer = None, \ changedby = None, poolfile = None, install_date = None): @@ -2186,6 +2252,10 @@ class DBSource(ORMObject): self.poolfile = poolfile self.install_date = install_date + @property + def pkid(self): + return self.source_id + def properties(self): return ['source', 'source_id', 'maintainer', 'changedby', \ 'fingerprint', 'poolfile', 'version', 'suites_count', \ @@ -2195,18 +2265,15 @@ class DBSource(ORMObject): return ['source', 'version', 'install_date', 'maintainer', \ 'changedby', 'poolfile', 'install_date'] - def read_control(self): + def read_control_fields(self): ''' Reads the control information from a dsc @rtype: tuple - @return: (stanza, controldict) stanza is the text of the control - section. 
controldict is the information in a dictionary - form + @return: fields is the dsc information in a dictionary form ''' - from debian.debfile import Deb822 fullpath = self.poolfile.fullpath - fields = Deb822(open(self.poolfile.fullpath, 'r')) + fields = Dak822(open(self.poolfile.fullpath, 'r')) return fields metadata = association_proxy('key', 'value') @@ -2354,6 +2421,34 @@ def get_source_in_suite(source, suite, session=None): __all__.append('get_source_in_suite') +@session_wrapper +def import_metadata_into_db(obj, session=None): + """ + This routine works on either DBBinary or DBSource objects and imports + their metadata into the database + """ + fields = obj.read_control_fields() + for k in fields.keys(): + try: + # Try raw ASCII + val = str(fields[k]) + except UnicodeEncodeError: + # Fall back to UTF-8 + try: + val = fields[k].encode('utf-8') + except UnicodeEncodeError: + # Finally try iso8859-1 + val = fields[k].encode('iso8859-1') + # Otherwise we allow the exception to percolate up and we cause + # a reject as someone is playing silly buggers + + obj.metadata[get_or_set_metadatakey(k, session)] = val + + session.commit_or_flush() + +__all__.append('import_metadata_into_db') + + ################################################################################ @session_wrapper @@ -2530,7 +2625,7 @@ def add_deb_to_db(u, filename, session=None): # session.rollback() # raise MissingContents, "No contents stored for package %s, and couldn't determine contents of %s" % (bin.package, filename) - return poolfile + return bin, poolfile __all__.append('add_deb_to_db') @@ -2853,6 +2948,38 @@ class MetadataKey(ORMObject): __all__.append('MetadataKey') +@session_wrapper +def get_or_set_metadatakey(keyname, session=None): + """ + Returns MetadataKey object for given uidname. + + If no matching keyname is found, a row is inserted. 
+ + @type uidname: string + @param uidname: The keyname to add + + @type session: SQLAlchemy + @param session: Optional SQL session object (a temporary one will be + generated if not supplied). If not passed, a commit will be performed at + the end of the function, otherwise the caller is responsible for commiting. + + @rtype: MetadataKey + @return: the metadatakey object for the given keyname + """ + + q = session.query(MetadataKey).filter_by(key=keyname) + + try: + ret = q.one() + except NoResultFound: + ret = MetadataKey(keyname) + session.add(ret) + session.commit_or_flush() + + return ret + +__all__.append('get_or_set_metadatakey') + ################################################################################ class BinaryMetadata(ORMObject): diff --git a/daklib/packages.py b/daklib/metadata.py similarity index 55% rename from daklib/packages.py rename to daklib/metadata.py index 27b6d287..d88cf4fa 100755 --- a/daklib/packages.py +++ b/daklib/metadata.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ -Helper code for packages generation. +Helper code for packages and sources generation. @contact: Debian FTPMaster @copyright: 2011 Torsten Werner @@ -34,17 +34,20 @@ from subprocess import Popen, PIPE import os.path -class PackagesScanner(object): +class MetadataScanner(object): ''' - PackagesScanner provides a threadsafe method scan() to scan the metadata of - a DBBinary object. - ''' - def __init__(self, binary_id): + MetadataScanner provides a threadsafe method scan() to scan the metadata of + a DBSource or DBBinary object depending on what is passed as dbclass''' + + def __init__(self, dbclass, pkid, verbose=True): ''' The argument binary_id is the id of the DBBinary object that + should be scanned. ''' - self.binary_id = binary_id + self.verbose = True + self.dbclass = dbclass + self.pkid = pkid def scan(self, dummy_arg = None): ''' @@ -52,43 +55,59 @@ class PackagesScanner(object): property. It commits any changes to the database. 
The argument dummy_arg is ignored but needed by our threadpool implementation. ''' + obj = None + fullpath = 'UNKNOWN PATH' + session = DBConn().session() - binary = session.query(DBBinary).get(self.binary_id) - fileset = set(binary.read_control()) - print fileset - #if len(fileset) == 0: - # fileset.add('EMPTY_PACKAGE') - #for filename in fileset: - # binary.contents.append(BinContents(file = filename)) - #session.commit() + try: + obj = session.query(self.dbclass).get(self.pkid) + fullpath = obj.poolfile.fullpath + import_metadata_into_db(obj, session=session) + if self.verbose: + print "Imported %s (%s)" % (self.pkid, fullpath) + session.commit() + except Exception, e: + print "Failed to import %s [id=%s; fullpath=%s]" % (self.dbclass.__name__, self.pkid, fullpath) + print "Exception: ", e + session.rollback() + session.close() @classmethod - def scan_all(class_, limit = None): + def scan_all(class_, scantype='source', limit = None): ''' - The class method scan_all() scans all binaries using multiple threads. - The number of binaries to be scanned can be limited with the limit - argument. Returns the number of processed and remaining packages as a + The class method scan_all() scans all sources using multiple threads. + The number of sources to be scanned can be limited with the limit + argument. Returns the number of processed and remaining files as a dict. 
''' session = DBConn().session() - query = session.query(DBBinary).filter(DBBinary.contents == None) + if scantype == 'source': + dbclass = DBSource + query = session.query(DBSource).filter(~DBSource.source_id.in_(session.query(SourceMetadata.source_id.distinct()))) + t = 'sources' + else: + # Otherwise binary + dbclass = DBBinary + query = session.query(DBBinary).filter(~DBBinary.binary_id.in_(session.query(BinaryMetadata.binary_id.distinct()))) + t = 'binaries' + remaining = query.count if limit is not None: query = query.limit(limit) processed = query.count() - pool = Pool() - for binary in query.yield_per(100): - pool.apply_async(scan_helper, (binary.binary_id, )) + pool = Pool(processes=10) + for obj in query.yield_per(100): + pool.apply_async(scan_helper, (dbclass, obj.pkid, )) pool.close() pool.join() remaining = remaining() session.close() - return { 'processed': processed, 'remaining': remaining } + return { 'processed': processed, 'remaining': remaining , 'type': t} -def scan_helper(binary_id): +def scan_helper(dbclass, source_id): ''' This function runs in a subprocess. ''' - scanner = PackagesScanner(binary_id) + scanner = MetadataScanner(dbclass, source_id) scanner.scan() diff --git a/daklib/queue.py b/daklib/queue.py index b4c62d38..52483cca 100755 --- a/daklib/queue.py +++ b/daklib/queue.py @@ -2025,6 +2025,7 @@ distribution.""" print "Installing." 
self.logger.log(["installing changes", self.pkg.changes_file]) + binaries = [] poolfiles = [] # Add the .dsc file to the DB first @@ -2037,7 +2038,9 @@ distribution.""" # Add .deb / .udeb files to the DB (type is always deb, dbtype is udeb/deb) for newfile, entry in self.pkg.files.items(): if entry["type"] == "deb": - poolfiles.append(add_deb_to_db(self, newfile, session)) + b, pf = add_deb_to_db(self, newfile, session) + binaries.append(b) + poolfiles.append(pf) # If this is a sourceful diff only upload that is moving # cross-component we need to copy the .orig files into the new @@ -2122,6 +2125,18 @@ distribution.""" # Our SQL session will automatically start a new transaction after # the last commit + # Now ensure that the metadata has been added + # This has to be done after we copy the files into the pool + # For source if we have it: + if self.pkg.changes["architecture"].has_key("source"): + import_metadata_into_db(source, session) + + # Now for any of our binaries + for b in binaries: + import_metadata_into_db(b, session) + + session.commit() + # Move the .changes into the 'done' directory utils.move(self.pkg.changes_file, os.path.join(cnf["Dir::Queue::Done"], os.path.basename(self.pkg.changes_file))) diff --git a/tests/dbtest_packages.py b/tests/dbtest_packages.py index 2b179053..f2587709 100755 --- a/tests/dbtest_packages.py +++ b/tests/dbtest_packages.py @@ -328,7 +328,7 @@ class PackageTestCase(DBDakTestCase): 'sha1sum': 'deadbeef', 'sha256sum': 'deadbeef'} upload = Upload(pkg) - poolfile = add_deb_to_db(upload, 'hello_2.2-2_i386.deb', self.session) + bin, poolfile = add_deb_to_db(upload, 'hello_2.2-2_i386.deb', self.session) self.session.refresh(poolfile) self.session.refresh(poolfile.binary) self.assertEqual('main/h/hello/hello_2.2-2_i386.deb', poolfile.filename)