"Generate Packages/Sources files"),
("contents",
"Generate content files"),
+ ("metadata",
+ "Load data for packages/sources files"),
("generate-index-diffs",
"Generate .diff/Index files"),
("clean-suites",
#!/usr/bin/env python
"""
-Import data for Packages files from .deb files
-
-@contact: Debian FTPMaster <ftpmaster@debian.org>
-@copyright: 2008, 2009 Michael Casadevall <mcasadevall@debian.org>
-@copyright: 2009 Mike O'Connor <stew@debian.org>
+Import data for Package/Sources files from .deb and .dsc files
@copyright: 2011 Torsten Werner <twerner@debian.org>
@copyright: 2011 Mark Hymers <mhy@debian.org>
@license: GNU General Public License version 2 or later
from daklib.config import Config
from daklib.dbconn import *
-from daklib.packages import PackagesScanner
+from daklib.metadata import MetadataScanner
from daklib import daklog
from daklib import utils
################################################################################
def usage (exit_code=0):
- print """Usage: dak packagescan [options] subcommand
+ print """Usage: dak metadata [options] subcommand
SUBCOMMANDS
- scan
- scan the debs in the existing pool and load metadata into the database
+ scan-source
+ scan the dsc files in the existing pool and load metadata into the database
+
+ scan-binary
+ scan the deb files in the existing pool and load metadata into the database
OPTIONS
-h, --help
OPTIONS for scan
-l, --limit=NUMBER
- maximum number of packages to scan
+ maximum number of items to scan
"""
sys.exit(exit_code)
################################################################################
-def scan_all(cnf, limit):
- Logger = daklog.Logger(cnf.Cnf, 'packages scan')
- result = PackagesScanner.scan_all(limit)
- processed = '%(processed)d packages processed' % result
- remaining = '%(remaining)d packages remaining' % result
+def scan_all(cnf, mode, limit):
+ Logger = daklog.Logger(cnf.Cnf, 'metadata scan (%s)' % mode)
+ result = MetadataScanner.scan_all(mode, limit)
+ processed = '%(processed)d %(type)s processed' % result
+ remaining = '%(remaining)d %(type)s remaining' % result
Logger.log([processed, remaining])
Logger.close()
def main():
cnf = Config()
- cnf['Packages::Options::Help'] = ''
- cnf['Packages::Options::Suite'] = ''
- cnf['Packages::Options::Limit'] = ''
- cnf['Packages::Options::Force'] = ''
- arguments = [('h', "help", 'Packages::Options::Help'),
- ('s', "suite", 'Packages::Options::Suite', "HasArg"),
- ('l', "limit", 'Packages::Options::Limit', "HasArg"),
- ('f', "force", 'Packages::Options::Force'),
+ cnf['Metadata::Options::Help'] = ''
+ cnf['Metadata::Options::Suite'] = ''
+ cnf['Metadata::Options::Limit'] = ''
+ cnf['Metadata::Options::Force'] = ''
+ arguments = [('h', "help", 'Metadata::Options::Help'),
+ ('s', "suite", 'Metadata::Options::Suite', "HasArg"),
+ ('l', "limit", 'Metadata::Options::Limit', "HasArg"),
+ ('f', "force", 'Metadata::Options::Force'),
]
args = apt_pkg.ParseCommandLine(cnf.Cnf, arguments, sys.argv)
- options = cnf.SubTree('Packages::Options')
+ options = cnf.SubTree('Metadata::Options')
if (len(args) != 1) or options['Help']:
usage()
if len(options['Limit']) > 0:
limit = int(options['Limit'])
- if args[0] == 'scan':
- scan_all(cnf, limit)
+ if args[0] == 'scan-source':
+ scan_all(cnf, 'source', limit)
+ return
+ elif args[0] == 'scan-binary':
+ scan_all(cnf, 'binary', limit)
return
suite_names = utils.split_args(options['Suite'])
self.poolfile = poolfile
self.binarytype = binarytype
+ @property
+ def pkid(self):
+ return self.binary_id
+
def properties(self):
return ['package', 'version', 'maintainer', 'source', 'architecture', \
'poolfile', 'binarytype', 'fingerprint', 'install_date', \
'''
Reads the control information from a binary.
- @rtype: tuple
- @return: (stanza, controldict) stanza is the text of the control
- section. controldict is the information in a dictionary
- form
+ @rtype: text
+ @return: stanza text of the control section.
'''
- import apt_inst, apt_pk
+ import apt_inst
fullpath = self.poolfile.fullpath
deb_file = open(fullpath, 'r')
- stanza = apt_inst.debExtractControl(deb_file).rstrip()
- control = dict(apt_pkg.TagSection(stanza))
+ stanza = apt_inst.debExtractControl(deb_file)
deb_file.close()
- return stanza, control
+ return stanza
+
+ def read_control_fields(self):
+ '''
+ Reads the control information from a binary and return
+ as a dictionary.
+ @rtype: dict
+ @return: fields of the control section as a dictionary.
+ '''
+ import apt_pkg
+ stanza = self.read_control()
+ return apt_pkg.TagSection(stanza)
__all__.append('DBBinary')
################################################################################
+from debian.debfile import Deb822
+
+# Temporary Deb822 subclass to fix bugs with : handling; see #597249
+class Dak822(Deb822):
+ def _internal_parser(self, sequence, fields=None):
+ # The key is non-whitespace, non-colon characters before any colon.
+ key_part = r"^(?P<key>[^: \t\n\r\f\v]+)\s*:\s*"
+ single = re.compile(key_part + r"(?P<data>\S.*?)\s*$")
+ multi = re.compile(key_part + r"$")
+ multidata = re.compile(r"^\s(?P<data>.+?)\s*$")
+
+ wanted_field = lambda f: fields is None or f in fields
+
+ if isinstance(sequence, basestring):
+ sequence = sequence.splitlines()
+
+ curkey = None
+ content = ""
+ for line in self.gpg_stripped_paragraph(sequence):
+ m = single.match(line)
+ if m:
+ if curkey:
+ self[curkey] = content
+
+ if not wanted_field(m.group('key')):
+ curkey = None
+ continue
+
+ curkey = m.group('key')
+ content = m.group('data')
+ continue
+
+ m = multi.match(line)
+ if m:
+ if curkey:
+ self[curkey] = content
+
+ if not wanted_field(m.group('key')):
+ curkey = None
+ continue
+
+ curkey = m.group('key')
+ content = ""
+ continue
+
+ m = multidata.match(line)
+ if m:
+ content += '\n' + line # XXX not m.group('data')?
+ continue
+
+ if curkey:
+ self[curkey] = content
+
+
class DBSource(ORMObject):
def __init__(self, source = None, version = None, maintainer = None, \
changedby = None, poolfile = None, install_date = None):
self.poolfile = poolfile
self.install_date = install_date
+ @property
+ def pkid(self):
+ return self.source_id
+
def properties(self):
return ['source', 'source_id', 'maintainer', 'changedby', \
'fingerprint', 'poolfile', 'version', 'suites_count', \
return ['source', 'version', 'install_date', 'maintainer', \
'changedby', 'poolfile', 'install_date']
- def read_control(self):
+ def read_control_fields(self):
'''
Reads the control information from a dsc
@rtype: tuple
- @return: (stanza, controldict) stanza is the text of the control
- section. controldict is the information in a dictionary
- form
+    @return: the fields of the dsc as a dictionary
'''
- from debian.debfile import Deb822
fullpath = self.poolfile.fullpath
- fields = Deb822(open(self.poolfile.fullpath, 'r'))
+ fields = Dak822(open(self.poolfile.fullpath, 'r'))
return fields
metadata = association_proxy('key', 'value')
__all__.append('get_source_in_suite')
+@session_wrapper
+def import_metadata_into_db(obj, session=None):
+ """
+ This routine works on either DBBinary or DBSource objects and imports
+ their metadata into the database
+ """
+ fields = obj.read_control_fields()
+ for k in fields.keys():
+ try:
+ # Try raw ASCII
+ val = str(fields[k])
+ except UnicodeEncodeError:
+ # Fall back to UTF-8
+ try:
+ val = fields[k].encode('utf-8')
+ except UnicodeEncodeError:
+ # Finally try iso8859-1
+ val = fields[k].encode('iso8859-1')
+ # Otherwise we allow the exception to percolate up and we cause
+ # a reject as someone is playing silly buggers
+
+ obj.metadata[get_or_set_metadatakey(k, session)] = val
+
+ session.commit_or_flush()
+
+__all__.append('import_metadata_into_db')
+
+
################################################################################
@session_wrapper
# session.rollback()
# raise MissingContents, "No contents stored for package %s, and couldn't determine contents of %s" % (bin.package, filename)
- return poolfile
+ return bin, poolfile
__all__.append('add_deb_to_db')
__all__.append('MetadataKey')
+@session_wrapper
+def get_or_set_metadatakey(keyname, session=None):
+ """
+    Returns MetadataKey object for given keyname.
+
+    If no matching keyname is found, a row is inserted.
+
+    @type keyname: string
+    @param keyname: The keyname to add
+
+ @type session: SQLAlchemy
+ @param session: Optional SQL session object (a temporary one will be
+ generated if not supplied). If not passed, a commit will be performed at
+    the end of the function, otherwise the caller is responsible for committing.
+
+ @rtype: MetadataKey
+ @return: the metadatakey object for the given keyname
+ """
+
+ q = session.query(MetadataKey).filter_by(key=keyname)
+
+ try:
+ ret = q.one()
+ except NoResultFound:
+ ret = MetadataKey(keyname)
+ session.add(ret)
+ session.commit_or_flush()
+
+ return ret
+
+__all__.append('get_or_set_metadatakey')
+
################################################################################
class BinaryMetadata(ORMObject):
#!/usr/bin/env python
"""
-Helper code for packages generation.
+Helper code for packages and sources generation.
@contact: Debian FTPMaster <ftpmaster@debian.org>
@copyright: 2011 Torsten Werner <twerner@debian.org>
import os.path
-class PackagesScanner(object):
+class MetadataScanner(object):
'''
- PackagesScanner provides a threadsafe method scan() to scan the metadata of
- a DBBinary object.
- '''
- def __init__(self, binary_id):
+ MetadataScanner provides a threadsafe method scan() to scan the metadata of
+ a DBSource or DBBinary object depending on what is passed as dbclass'''
+
+ def __init__(self, dbclass, pkid, verbose=True):
'''
The argument binary_id is the id of the DBBinary object that
+
should be scanned.
'''
- self.binary_id = binary_id
+        self.verbose = verbose
+ self.dbclass = dbclass
+ self.pkid = pkid
def scan(self, dummy_arg = None):
'''
property. It commits any changes to the database. The argument dummy_arg
is ignored but needed by our threadpool implementation.
'''
+ obj = None
+ fullpath = 'UNKNOWN PATH'
+
session = DBConn().session()
- binary = session.query(DBBinary).get(self.binary_id)
- fileset = set(binary.read_control())
- print fileset
- #if len(fileset) == 0:
- # fileset.add('EMPTY_PACKAGE')
- #for filename in fileset:
- # binary.contents.append(BinContents(file = filename))
- #session.commit()
+ try:
+ obj = session.query(self.dbclass).get(self.pkid)
+ fullpath = obj.poolfile.fullpath
+ import_metadata_into_db(obj, session=session)
+ if self.verbose:
+ print "Imported %s (%s)" % (self.pkid, fullpath)
+ session.commit()
+ except Exception, e:
+ print "Failed to import %s [id=%s; fullpath=%s]" % (self.dbclass.__name__, self.pkid, fullpath)
+ print "Exception: ", e
+ session.rollback()
+
session.close()
@classmethod
- def scan_all(class_, limit = None):
+ def scan_all(class_, scantype='source', limit = None):
'''
- The class method scan_all() scans all binaries using multiple threads.
- The number of binaries to be scanned can be limited with the limit
- argument. Returns the number of processed and remaining packages as a
+ The class method scan_all() scans all sources using multiple threads.
+ The number of sources to be scanned can be limited with the limit
+ argument. Returns the number of processed and remaining files as a
dict.
'''
session = DBConn().session()
- query = session.query(DBBinary).filter(DBBinary.contents == None)
+ if scantype == 'source':
+ dbclass = DBSource
+ query = session.query(DBSource).filter(~DBSource.source_id.in_(session.query(SourceMetadata.source_id.distinct())))
+ t = 'sources'
+ else:
+ # Otherwise binary
+ dbclass = DBBinary
+ query = session.query(DBBinary).filter(~DBBinary.binary_id.in_(session.query(BinaryMetadata.binary_id.distinct())))
+ t = 'binaries'
+
remaining = query.count
if limit is not None:
query = query.limit(limit)
processed = query.count()
- pool = Pool()
- for binary in query.yield_per(100):
- pool.apply_async(scan_helper, (binary.binary_id, ))
+ pool = Pool(processes=10)
+ for obj in query.yield_per(100):
+ pool.apply_async(scan_helper, (dbclass, obj.pkid, ))
pool.close()
pool.join()
remaining = remaining()
session.close()
- return { 'processed': processed, 'remaining': remaining }
+ return { 'processed': processed, 'remaining': remaining , 'type': t}
-def scan_helper(binary_id):
+def scan_helper(dbclass, source_id):
'''
This function runs in a subprocess.
'''
- scanner = PackagesScanner(binary_id)
+ scanner = MetadataScanner(dbclass, source_id)
scanner.scan()
print "Installing."
self.logger.log(["installing changes", self.pkg.changes_file])
+ binaries = []
poolfiles = []
# Add the .dsc file to the DB first
# Add .deb / .udeb files to the DB (type is always deb, dbtype is udeb/deb)
for newfile, entry in self.pkg.files.items():
if entry["type"] == "deb":
- poolfiles.append(add_deb_to_db(self, newfile, session))
+ b, pf = add_deb_to_db(self, newfile, session)
+ binaries.append(b)
+ poolfiles.append(pf)
# If this is a sourceful diff only upload that is moving
# cross-component we need to copy the .orig files into the new
# Our SQL session will automatically start a new transaction after
# the last commit
+ # Now ensure that the metadata has been added
+ # This has to be done after we copy the files into the pool
+ # For source if we have it:
+ if self.pkg.changes["architecture"].has_key("source"):
+ import_metadata_into_db(source, session)
+
+ # Now for any of our binaries
+ for b in binaries:
+ import_metadata_into_db(b, session)
+
+ session.commit()
+
# Move the .changes into the 'done' directory
utils.move(self.pkg.changes_file,
os.path.join(cnf["Dir::Queue::Done"], os.path.basename(self.pkg.changes_file)))
'sha1sum': 'deadbeef',
'sha256sum': 'deadbeef'}
upload = Upload(pkg)
- poolfile = add_deb_to_db(upload, 'hello_2.2-2_i386.deb', self.session)
+ bin, poolfile = add_deb_to_db(upload, 'hello_2.2-2_i386.deb', self.session)
self.session.refresh(poolfile)
self.session.refresh(poolfile.binary)
self.assertEqual('main/h/hello/hello_2.2-2_i386.deb', poolfile.filename)