err.no Git - dak/commitdiff
metadata generation work
author Mark Hymers <mhy@debian.org>
Wed, 23 Mar 2011 18:34:24 +0000 (18:34 +0000)
committer Mark Hymers <mhy@debian.org>
Wed, 23 Mar 2011 18:34:24 +0000 (18:34 +0000)
Signed-off-by: Mark Hymers <mhy@debian.org>
dak/dak.py
dak/metadata.py [moved from dak/packagescan.py with 65% similarity]
daklib/dbconn.py
daklib/metadata.py [moved from daklib/packages.py with 55% similarity]
daklib/queue.py
tests/dbtest_packages.py

index 5a659d8c5a157d6cbde4be4980c549fec4b73e2b..ad99a5a06c80b5c4e383dad854583d5ae67b2a15 100755 (executable)
@@ -86,6 +86,8 @@ def init():
          "Generate Packages/Sources files"),
         ("contents",
          "Generate content files"),
+        ("metadata",
+         "Load data for packages/sources files"),
         ("generate-index-diffs",
          "Generate .diff/Index files"),
         ("clean-suites",
similarity index 65%
rename from dak/packagescan.py
rename to dak/metadata.py
index 2d2bab0057cbfad6a76cdee002521c1d397c9ec2..f40c9431557599951903a52c01bb07d6130f88ac 100755 (executable)
@@ -1,10 +1,6 @@
 #!/usr/bin/env python
 """
-Import data for Packages files from .deb files
-
-@contact: Debian FTPMaster <ftpmaster@debian.org>
-@copyright: 2008, 2009 Michael Casadevall <mcasadevall@debian.org>
-@copyright: 2009 Mike O'Connor <stew@debian.org>
+Import data for Packages/Sources files from .deb and .dsc files
 @copyright: 2011 Torsten Werner <twerner@debian.org>
 @copyright: 2011 Mark Hymers <mhy@debian.org>
 @license: GNU General Public License version 2 or later
@@ -41,18 +37,21 @@ import apt_pkg
 
 from daklib.config import Config
 from daklib.dbconn import *
-from daklib.packages import PackagesScanner
+from daklib.metadata import MetadataScanner
 from daklib import daklog
 from daklib import utils
 
 ################################################################################
 
 def usage (exit_code=0):
-    print """Usage: dak packagescan [options] subcommand
+    print """Usage: dak metadata [options] subcommand
 
 SUBCOMMANDS
-    scan
-        scan the debs in the existing pool and load metadata into the database
+    scan-source
+        scan the dsc files in the existing pool and load metadata into the database
+
+    scan-binary
+        scan the deb files in the existing pool and load metadata into the database
 
 OPTIONS
      -h, --help
@@ -60,17 +59,17 @@ OPTIONS
 
 OPTIONS for scan
      -l, --limit=NUMBER
-        maximum number of packages to scan
+        maximum number of items to scan
 """
     sys.exit(exit_code)
 
 ################################################################################
 
-def scan_all(cnf, limit):
-    Logger = daklog.Logger(cnf.Cnf, 'packages scan')
-    result = PackagesScanner.scan_all(limit)
-    processed = '%(processed)d packages processed' % result
-    remaining = '%(remaining)d packages remaining' % result
+def scan_all(cnf, mode, limit):
+    Logger = daklog.Logger(cnf.Cnf, 'metadata scan (%s)' % mode)
+    result = MetadataScanner.scan_all(mode, limit)
+    processed = '%(processed)d %(type)s processed' % result
+    remaining = '%(remaining)d %(type)s remaining' % result
     Logger.log([processed, remaining])
     Logger.close()
 
@@ -78,17 +77,17 @@ def scan_all(cnf, limit):
 
 def main():
     cnf = Config()
-    cnf['Packages::Options::Help'] = ''
-    cnf['Packages::Options::Suite'] = ''
-    cnf['Packages::Options::Limit'] = ''
-    cnf['Packages::Options::Force'] = ''
-    arguments = [('h', "help",  'Packages::Options::Help'),
-                 ('s', "suite", 'Packages::Options::Suite', "HasArg"),
-                 ('l', "limit", 'Packages::Options::Limit', "HasArg"),
-                 ('f', "force", 'Packages::Options::Force'),
+    cnf['Metadata::Options::Help'] = ''
+    cnf['Metadata::Options::Suite'] = ''
+    cnf['Metadata::Options::Limit'] = ''
+    cnf['Metadata::Options::Force'] = ''
+    arguments = [('h', "help",  'Metadata::Options::Help'),
+                 ('s', "suite", 'Metadata::Options::Suite', "HasArg"),
+                 ('l', "limit", 'Metadata::Options::Limit', "HasArg"),
+                 ('f', "force", 'Metadata::Options::Force'),
                 ]
     args = apt_pkg.ParseCommandLine(cnf.Cnf, arguments, sys.argv)
-    options = cnf.SubTree('Packages::Options')
+    options = cnf.SubTree('Metadata::Options')
 
     if (len(args) != 1) or options['Help']:
         usage()
@@ -97,8 +96,11 @@ def main():
     if len(options['Limit']) > 0:
         limit = int(options['Limit'])
 
-    if args[0] == 'scan':
-        scan_all(cnf, limit)
+    if args[0] == 'scan-source':
+        scan_all(cnf, 'source', limit)
+        return
+    elif args[0] == 'scan-binary':
+        scan_all(cnf, 'binary', limit)
         return
 
     suite_names = utils.split_args(options['Suite'])
index 4d30e663b994fee443f87aa120dbca0fb9bc1a03..98b6c7d50eb6a23affaa1f3453e485055ccb7e53 100755 (executable)
@@ -492,6 +492,10 @@ class DBBinary(ORMObject):
         self.poolfile = poolfile
         self.binarytype = binarytype
 
+    @property
+    def pkid(self):
+        return self.binary_id
+
     def properties(self):
         return ['package', 'version', 'maintainer', 'source', 'architecture', \
             'poolfile', 'binarytype', 'fingerprint', 'install_date', \
@@ -533,20 +537,28 @@ class DBBinary(ORMObject):
         '''
         Reads the control information from a binary.
 
-        @rtype: tuple
-        @return: (stanza, controldict)  stanza is the text of the control
-                 section.  controldict is the information in a dictionary
-                 form
+        @rtype: text
+        @return: stanza text of the control section.
         '''
-        import apt_inst, apt_pk
+        import apt_inst
         fullpath = self.poolfile.fullpath
         deb_file = open(fullpath, 'r')
-        stanza = apt_inst.debExtractControl(deb_file).rstrip()
-        control = dict(apt_pkg.TagSection(stanza))
+        stanza = apt_inst.debExtractControl(deb_file)
         deb_file.close()
 
-        return stanza, control
+        return stanza
+
+    def read_control_fields(self):
+        '''
+        Reads the control information from a binary and return
+        as a dictionary.
 
+        @rtype: dict
+        @return: fields of the control section as a dictionary.
+        '''
+        import apt_pkg
+        stanza = self.read_control()
+        return apt_pkg.TagSection(stanza)
 
 __all__.append('DBBinary')
 
@@ -2176,6 +2188,60 @@ __all__.append('get_sections')
 
 ################################################################################
 
+from debian.debfile import Deb822
+
+# Temporary Deb822 subclass to fix bugs with : handling; see #597249
+class Dak822(Deb822):
+    def _internal_parser(self, sequence, fields=None):
+        # The key is non-whitespace, non-colon characters before any colon.
+        key_part = r"^(?P<key>[^: \t\n\r\f\v]+)\s*:\s*"
+        single = re.compile(key_part + r"(?P<data>\S.*?)\s*$")
+        multi = re.compile(key_part + r"$")
+        multidata = re.compile(r"^\s(?P<data>.+?)\s*$")
+
+        wanted_field = lambda f: fields is None or f in fields
+
+        if isinstance(sequence, basestring):
+            sequence = sequence.splitlines()
+
+        curkey = None
+        content = ""
+        for line in self.gpg_stripped_paragraph(sequence):
+            m = single.match(line)
+            if m:
+                if curkey:
+                    self[curkey] = content
+
+                if not wanted_field(m.group('key')):
+                    curkey = None
+                    continue
+
+                curkey = m.group('key')
+                content = m.group('data')
+                continue
+
+            m = multi.match(line)
+            if m:
+                if curkey:
+                    self[curkey] = content
+
+                if not wanted_field(m.group('key')):
+                    curkey = None
+                    continue
+
+                curkey = m.group('key')
+                content = ""
+                continue
+
+            m = multidata.match(line)
+            if m:
+                content += '\n' + line # XXX not m.group('data')?
+                continue
+
+        if curkey:
+            self[curkey] = content
+
+
 class DBSource(ORMObject):
     def __init__(self, source = None, version = None, maintainer = None, \
         changedby = None, poolfile = None, install_date = None):
@@ -2186,6 +2252,10 @@ class DBSource(ORMObject):
         self.poolfile = poolfile
         self.install_date = install_date
 
+    @property
+    def pkid(self):
+        return self.source_id
+
     def properties(self):
         return ['source', 'source_id', 'maintainer', 'changedby', \
             'fingerprint', 'poolfile', 'version', 'suites_count', \
@@ -2195,18 +2265,15 @@ class DBSource(ORMObject):
         return ['source', 'version', 'install_date', 'maintainer', \
             'changedby', 'poolfile', 'install_date']
 
-    def read_control(self):
+    def read_control_fields(self):
         '''
         Reads the control information from a dsc
 
         @rtype: tuple
-        @return: (stanza, controldict)  stanza is the text of the control
-                 section.  controldict is the information in a dictionary
-                 form
+        @return: fields is the dsc information in a dictionary form
         '''
-        from debian.debfile import Deb822
         fullpath = self.poolfile.fullpath
-        fields = Deb822(open(self.poolfile.fullpath, 'r'))
+        fields = Dak822(open(self.poolfile.fullpath, 'r'))
         return fields
 
     metadata = association_proxy('key', 'value')
@@ -2354,6 +2421,34 @@ def get_source_in_suite(source, suite, session=None):
 
 __all__.append('get_source_in_suite')
 
+@session_wrapper
+def import_metadata_into_db(obj, session=None):
+    """
+    This routine works on either DBBinary or DBSource objects and imports
+    their metadata into the database
+    """
+    fields = obj.read_control_fields()
+    for k in fields.keys():
+        try:
+            # Try raw ASCII
+            val = str(fields[k])
+        except UnicodeEncodeError:
+            # Fall back to UTF-8
+            try:
+                val = fields[k].encode('utf-8')
+            except UnicodeEncodeError:
+                # Finally try iso8859-1
+                val = fields[k].encode('iso8859-1')
+                # Otherwise we allow the exception to percolate up and we cause
+                # a reject as someone is playing silly buggers
+
+        obj.metadata[get_or_set_metadatakey(k, session)] = val
+
+    session.commit_or_flush()
+
+__all__.append('import_metadata_into_db')
+
+
 ################################################################################
 
 @session_wrapper
@@ -2530,7 +2625,7 @@ def add_deb_to_db(u, filename, session=None):
     #    session.rollback()
     #    raise MissingContents, "No contents stored for package %s, and couldn't determine contents of %s" % (bin.package, filename)
 
-    return poolfile
+    return bin, poolfile
 
 __all__.append('add_deb_to_db')
 
@@ -2853,6 +2948,38 @@ class MetadataKey(ORMObject):
 
 __all__.append('MetadataKey')
 
+@session_wrapper
+def get_or_set_metadatakey(keyname, session=None):
+    """
+    Returns MetadataKey object for given keyname.
+
+    If no matching keyname is found, a row is inserted.
+
+    @type keyname: string
+    @param keyname: The keyname to add
+
+    @type session: SQLAlchemy
+    @param session: Optional SQL session object (a temporary one will be
+    generated if not supplied).  If not passed, a commit will be performed at
+    the end of the function, otherwise the caller is responsible for committing.
+
+    @rtype: MetadataKey
+    @return: the metadatakey object for the given keyname
+    """
+
+    q = session.query(MetadataKey).filter_by(key=keyname)
+
+    try:
+        ret = q.one()
+    except NoResultFound:
+        ret = MetadataKey(keyname)
+        session.add(ret)
+        session.commit_or_flush()
+
+    return ret
+
+__all__.append('get_or_set_metadatakey')
+
 ################################################################################
 
 class BinaryMetadata(ORMObject):
similarity index 55%
rename from daklib/packages.py
rename to daklib/metadata.py
index 27b6d287534fbc8c1569ba4ea3553da7aaf5feee..d88cf4faf2a1ad0cd88675a182858c76c836cf12 100755 (executable)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 """
-Helper code for packages generation.
+Helper code for packages and sources generation.
 
 @contact: Debian FTPMaster <ftpmaster@debian.org>
 @copyright: 2011 Torsten Werner <twerner@debian.org>
@@ -34,17 +34,20 @@ from subprocess import Popen, PIPE
 
 import os.path
 
-class PackagesScanner(object):
+class MetadataScanner(object):
     '''
-    PackagesScanner provides a threadsafe method scan() to scan the metadata of
-    a DBBinary object.
-    '''
-    def __init__(self, binary_id):
+    MetadataScanner provides a threadsafe method scan() to scan the metadata of
+    a DBSource or DBBinary object depending on what is passed as dbclass'''
+
+    def __init__(self, dbclass, pkid, verbose=True):
         '''
         The argument binary_id is the id of the DBBinary object that
+
         should be scanned.
         '''
-        self.binary_id = binary_id
+        self.verbose = True
+        self.dbclass = dbclass
+        self.pkid = pkid
 
     def scan(self, dummy_arg = None):
         '''
@@ -52,43 +55,59 @@ class PackagesScanner(object):
         property. It commits any changes to the database. The argument dummy_arg
         is ignored but needed by our threadpool implementation.
         '''
+        obj = None
+        fullpath = 'UNKNOWN PATH'
+
         session = DBConn().session()
-        binary = session.query(DBBinary).get(self.binary_id)
-        fileset = set(binary.read_control())
-        print fileset
-        #if len(fileset) == 0:
-        #    fileset.add('EMPTY_PACKAGE')
-        #for filename in fileset:
-        #    binary.contents.append(BinContents(file = filename))
-        #session.commit()
+        try:
+            obj = session.query(self.dbclass).get(self.pkid)
+            fullpath = obj.poolfile.fullpath
+            import_metadata_into_db(obj, session=session)
+            if self.verbose:
+                print "Imported %s (%s)" % (self.pkid, fullpath)
+            session.commit()
+        except Exception, e:
+            print "Failed to import %s [id=%s; fullpath=%s]" % (self.dbclass.__name__, self.pkid, fullpath)
+            print "Exception: ", e
+            session.rollback()
+
         session.close()
 
     @classmethod
-    def scan_all(class_, limit = None):
+    def scan_all(class_, scantype='source', limit = None):
         '''
-        The class method scan_all() scans all binaries using multiple threads.
-        The number of binaries to be scanned can be limited with the limit
-        argument. Returns the number of processed and remaining packages as a
+        The class method scan_all() scans all sources or binaries (depending on
+        scantype) using multiple threads. The number of items to be scanned can be
+        limited with the limit argument. Returns the processed and remaining counts as a
         dict.
         '''
         session = DBConn().session()
-        query = session.query(DBBinary).filter(DBBinary.contents == None)
+        if scantype == 'source':
+            dbclass = DBSource
+            query = session.query(DBSource).filter(~DBSource.source_id.in_(session.query(SourceMetadata.source_id.distinct())))
+            t = 'sources'
+        else:
+            # Otherwise binary
+            dbclass = DBBinary
+            query = session.query(DBBinary).filter(~DBBinary.binary_id.in_(session.query(BinaryMetadata.binary_id.distinct())))
+            t = 'binaries'
+
         remaining = query.count
         if limit is not None:
             query = query.limit(limit)
         processed = query.count()
-        pool = Pool()
-        for binary in query.yield_per(100):
-            pool.apply_async(scan_helper, (binary.binary_id, ))
+        pool = Pool(processes=10)
+        for obj in query.yield_per(100):
+            pool.apply_async(scan_helper, (dbclass, obj.pkid, ))
         pool.close()
         pool.join()
         remaining = remaining()
         session.close()
-        return { 'processed': processed, 'remaining': remaining }
+        return { 'processed': processed, 'remaining': remaining , 'type': t}
 
-def scan_helper(binary_id):
+def scan_helper(dbclass, source_id):
     '''
     This function runs in a subprocess.
     '''
-    scanner = PackagesScanner(binary_id)
+    scanner = MetadataScanner(dbclass, source_id)
     scanner.scan()
index b4c62d38a204ded115d6bd7738e1913ffe02438a..52483cca4bd8dc0d53b2f6b95670f250ef3d2b9e 100755 (executable)
@@ -2025,6 +2025,7 @@ distribution."""
         print "Installing."
         self.logger.log(["installing changes", self.pkg.changes_file])
 
+        binaries = []
         poolfiles = []
 
         # Add the .dsc file to the DB first
@@ -2037,7 +2038,9 @@ distribution."""
         # Add .deb / .udeb files to the DB (type is always deb, dbtype is udeb/deb)
         for newfile, entry in self.pkg.files.items():
             if entry["type"] == "deb":
-                poolfiles.append(add_deb_to_db(self, newfile, session))
+                b, pf = add_deb_to_db(self, newfile, session)
+                binaries.append(b)
+                poolfiles.append(pf)
 
         # If this is a sourceful diff only upload that is moving
         # cross-component we need to copy the .orig files into the new
@@ -2122,6 +2125,18 @@ distribution."""
         # Our SQL session will automatically start a new transaction after
         # the last commit
 
+        # Now ensure that the metadata has been added
+        # This has to be done after we copy the files into the pool
+        # For source if we have it:
+        if self.pkg.changes["architecture"].has_key("source"):
+            import_metadata_into_db(source, session)
+
+        # Now for any of our binaries
+        for b in binaries:
+            import_metadata_into_db(b, session)
+
+        session.commit()
+
         # Move the .changes into the 'done' directory
         utils.move(self.pkg.changes_file,
                    os.path.join(cnf["Dir::Queue::Done"], os.path.basename(self.pkg.changes_file)))
index 2b17905305c8df7314df27207ddd9e5f2175216c..f258770983edbc6a7cc32459fda49c5b6e864833 100755 (executable)
@@ -328,7 +328,7 @@ class PackageTestCase(DBDakTestCase):
             'sha1sum': 'deadbeef',
             'sha256sum': 'deadbeef'}
         upload = Upload(pkg)
-        poolfile = add_deb_to_db(upload, 'hello_2.2-2_i386.deb', self.session)
+        bin, poolfile = add_deb_to_db(upload, 'hello_2.2-2_i386.deb', self.session)
         self.session.refresh(poolfile)
         self.session.refresh(poolfile.binary)
         self.assertEqual('main/h/hello/hello_2.2-2_i386.deb', poolfile.filename)