diff options
Diffstat (limited to 'cvs2svn_lib/collect_data.py')
-rw-r--r-- | cvs2svn_lib/collect_data.py | 1431 |
1 file changed, 1431 insertions, 0 deletions
diff --git a/cvs2svn_lib/collect_data.py b/cvs2svn_lib/collect_data.py new file mode 100644 index 0000000..160d7b9 --- /dev/null +++ b/cvs2svn_lib/collect_data.py @@ -0,0 +1,1431 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""Data collection classes. + +This module contains the code used to collect data from the CVS +repository. It parses *,v files, recording all useful information +except for the actual file contents (though even the file contents +might be recorded by the RevisionRecorder if one is configured). + +As a *,v file is parsed, the information pertaining to the file is +accumulated in memory, mostly in _RevisionData, _BranchData, and +_TagData objects. When parsing is complete, a final pass is made over +the data to create some final dependency links, collect statistics, +etc., then the _*Data objects are converted into CVSItem objects +(CVSRevision, CVSBranch, and CVSTag respectively) and the CVSItems are +dumped into databases. + +During the data collection, persistent unique ids are allocated to +many types of objects: CVSFile, Symbol, and CVSItems. CVSItems are a +special case. 
CVSItem ids are unique across all CVSItem types, and +the ids are carried over from the corresponding data collection +objects: + + _RevisionData -> CVSRevision + + _BranchData -> CVSBranch + + _TagData -> CVSTag + +In a later pass it is possible to convert tags <-> branches. But even +if this occurs, the new branch or tag uses the same id as the old tag +or branch. + +""" + + +import os +import stat +import re + +from cvs2svn_lib import config +from cvs2svn_lib.common import DB_OPEN_NEW +from cvs2svn_lib.common import FatalError +from cvs2svn_lib.common import warning_prefix +from cvs2svn_lib.common import error_prefix +from cvs2svn_lib.common import IllegalSVNPathError +from cvs2svn_lib.common import verify_svn_filename_legal +from cvs2svn_lib.log import Log +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.artifact_manager import artifact_manager +from cvs2svn_lib.project import FileInAndOutOfAtticException +from cvs2svn_lib.cvs_file import CVSPath +from cvs2svn_lib.cvs_file import CVSDirectory +from cvs2svn_lib.cvs_file import CVSFile +from cvs2svn_lib.symbol import Symbol +from cvs2svn_lib.symbol import Trunk +from cvs2svn_lib.cvs_item import CVSRevision +from cvs2svn_lib.cvs_item import CVSBranch +from cvs2svn_lib.cvs_item import CVSTag +from cvs2svn_lib.cvs_item import cvs_revision_type_map +from cvs2svn_lib.cvs_file_items import VendorBranchError +from cvs2svn_lib.cvs_file_items import CVSFileItems +from cvs2svn_lib.key_generator import KeyGenerator +from cvs2svn_lib.cvs_item_database import NewCVSItemStore +from cvs2svn_lib.symbol_statistics import SymbolStatisticsCollector +from cvs2svn_lib.metadata_database import MetadataDatabase +from cvs2svn_lib.metadata_database import MetadataLogger + +import cvs2svn_rcsparse + + +# A regular expression defining "valid" revision numbers (used to +# check that symbol definitions are reasonable). 
+_valid_revision_re = re.compile(r''' + ^ + (?:\d+\.)+ # Digit groups with trailing dots + \d+ # And the last digit group. + $ + ''', re.VERBOSE) + +_branch_revision_re = re.compile(r''' + ^ + ((?:\d+\.\d+\.)+) # A nonzero even number of digit groups w/trailing dot + (?:0\.)? # CVS sticks an extra 0 here; RCS does not + (\d+) # And the last digit group + $ + ''', re.VERBOSE) + + +def rev_tuple(rev): + """Return a tuple of integers corresponding to revision number REV. + + For example, if REV is '1.2.3.4', then return (1,2,3,4).""" + + return tuple([int(x) for x in rev.split('.')]) + + +def is_trunk_revision(rev): + """Return True iff REV is a trunk revision. + + REV is a revision number corresponding to a specific revision (i.e., + not a whole branch).""" + + return rev.count('.') == 1 + + +def is_branch_revision_number(rev): + """Return True iff REV is a branch revision number. + + REV is a CVS revision number in canonical form (i.e., with zeros + removed). Return True iff it refers to a whole branch, as opposed + to a single revision.""" + + return rev.count('.') % 2 == 0 + + +def is_same_line_of_development(rev1, rev2): + """Return True if rev1 and rev2 are on the same line of + development (i.e., both on trunk, or both on the same branch); + return False otherwise. Either rev1 or rev2 can be None, in + which case automatically return False.""" + + if rev1 is None or rev2 is None: + return False + if rev1.count('.') == 1 and rev2.count('.') == 1: + return True + if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]: + return True + return False + + +class _RevisionData: + """We track the state of each revision so that in set_revision_info, + we can determine if our op is an add/change/delete. We can do this + because in set_revision_info, we'll have all of the _RevisionData + for a file at our fingertips, and we need to examine the state of + our prev_rev to determine if we're an add or a change. 
Without the + state of the prev_rev, we are unable to distinguish between an add + and a change.""" + + def __init__(self, cvs_rev_id, rev, timestamp, author, state): + # The id of this revision: + self.cvs_rev_id = cvs_rev_id + self.rev = rev + self.timestamp = timestamp + self.author = author + self.original_timestamp = timestamp + self.state = state + + # If this is the first revision on a branch, then this is the + # branch_data of that branch; otherwise it is None. + self.parent_branch_data = None + + # The revision number of the parent of this revision along the + # same line of development, if any. For the first revision R on a + # branch, we consider the revision from which R sprouted to be the + # 'parent'. If this is the root revision in the file's revision + # tree, then this field is None. + # + # Note that this revision can't be determined arithmetically (due + # to cvsadmin -o), which is why this field is necessary. + self.parent = None + + # The revision number of the primary child of this revision (the + # child along the same line of development), if any; otherwise, + # None. + self.child = None + + # The _BranchData instances of branches that sprout from this + # revision, sorted in ascending order by branch number. It would + # be inconvenient to initialize it here because we would have to + # scan through all branches known by the _SymbolDataCollector to + # find the ones having us as the parent. Instead, this + # information is filled in by + # _FileDataCollector._resolve_dependencies() and sorted by + # _FileDataCollector._sort_branches(). + self.branches_data = [] + + # The revision numbers of the first commits on any branches on + # which commits occurred. This dependency is kept explicitly + # because otherwise a revision-only topological sort would miss + # the dependency that exists via branches_data. + self.branches_revs_data = [] + + # The _TagData instances of tags that are connected to this + # revision. 
+ self.tags_data = [] + + # A token that may be returned from + # RevisionRecorder.record_text(). It can be used by + # RevisionReader to obtain the text again. + self.revision_recorder_token = None + + def get_first_on_branch_id(self): + return self.parent_branch_data and self.parent_branch_data.id + + +class _SymbolData: + """Collection area for information about a symbol in a single CVSFile. + + SYMBOL is an instance of Symbol, undifferentiated as a Branch or a + Tag regardless of whether self is a _BranchData or a _TagData.""" + + def __init__(self, id, symbol): + """Initialize an object for SYMBOL.""" + + # The unique id that will be used for this particular symbol in + # this particular file. This same id will be used for the CVSItem + # that is derived from this instance. + self.id = id + + # An instance of Symbol. + self.symbol = symbol + + +class _BranchData(_SymbolData): + """Collection area for information about a Branch in a single CVSFile.""" + + def __init__(self, id, symbol, branch_number): + _SymbolData.__init__(self, id, symbol) + + # The branch number (e.g., '1.5.2') of this branch. + self.branch_number = branch_number + + # The revision number of the revision from which this branch + # sprouts (e.g., '1.5'). + self.parent = self.branch_number[:self.branch_number.rindex(".")] + + # The revision number of the first commit on this branch, if any + # (e.g., '1.5.2.1'); otherwise, None. + self.child = None + + +class _TagData(_SymbolData): + """Collection area for information about a Tag in a single CVSFile.""" + + def __init__(self, id, symbol, rev): + _SymbolData.__init__(self, id, symbol) + + # The revision number being tagged (e.g., '1.5.2.3'). 
+ self.rev = rev + + +class _SymbolDataCollector(object): + """Collect information about symbols in a single CVSFile.""" + + def __init__(self, fdc, cvs_file): + self.fdc = fdc + self.cvs_file = cvs_file + + self.pdc = self.fdc.pdc + self.collect_data = self.fdc.collect_data + + # A list [(name, revision), ...] of symbols defined in the header + # of the file. The name has already been transformed using the + # symbol transform rules. If the symbol transform rules indicate + # that the symbol should be ignored, then it is never added to + # this list. This list is processed then deleted in + # process_symbols(). + self._symbol_defs = [] + + # A set containing the transformed names of symbols in this file + # (used to detect duplicats during processing of unlabeled + # branches): + self._defined_symbols = set() + + # Map { branch_number : _BranchData }, where branch_number has an + # odd number of digits. + self.branches_data = { } + + # Map { revision : [ tag_data ] }, where revision has an even + # number of digits, and the value is a list of _TagData objects + # for tags that apply to that revision. + self.tags_data = { } + + def _add_branch(self, name, branch_number): + """Record that BRANCH_NUMBER is the branch number for branch NAME, + and derive and record the revision from which NAME sprouts. + BRANCH_NUMBER is an RCS branch number with an odd number of + components, for example '1.7.2' (never '1.7.0.2'). 
Return the + _BranchData instance (which is usually newly-created).""" + + branch_data = self.branches_data.get(branch_number) + + if branch_data is not None: + Log().warn( + "%s: in '%s':\n" + " branch '%s' already has name '%s',\n" + " cannot also have name '%s', ignoring the latter\n" + % (warning_prefix, + self.cvs_file.filename, branch_number, + branch_data.symbol.name, name) + ) + return branch_data + + symbol = self.pdc.get_symbol(name) + branch_data = _BranchData( + self.collect_data.item_key_generator.gen_id(), symbol, branch_number + ) + self.branches_data[branch_number] = branch_data + return branch_data + + def _construct_distinct_name(self, name, original_name): + """Construct a distinct symbol name from NAME. + + If NAME is distinct, return it. If it is already used in this + file (as determined from its presence in self._defined_symbols), + construct and return a new name that is not already used.""" + + if name not in self._defined_symbols: + return name + else: + index = 1 + while True: + dup_name = '%s-DUPLICATE-%d' % (name, index,) + if dup_name not in self._defined_symbols: + self.collect_data.record_fatal_error( + "Symbol name '%s' is already used in '%s'.\n" + "The unlabeled branch '%s' must be renamed using " + "--symbol-transform." + % (name, self.cvs_file.filename, original_name,) + ) + return dup_name + + def _add_unlabeled_branch(self, branch_number): + original_name = "unlabeled-" + branch_number + name = self.transform_symbol(original_name, branch_number) + if name is None: + self.collect_data.record_fatal_error( + "The unlabeled branch '%s' in '%s' contains commits.\n" + "It may not be ignored via a symbol transform. 
(Use --exclude " + "instead.)" + % (original_name, self.cvs_file.filename,) + ) + # Retain the original name to allow the conversion to continue: + name = original_name + + distinct_name = self._construct_distinct_name(name, original_name) + self._defined_symbols.add(distinct_name) + return self._add_branch(distinct_name, branch_number) + + def _add_tag(self, name, revision): + """Record that tag NAME refers to the specified REVISION.""" + + symbol = self.pdc.get_symbol(name) + tag_data = _TagData( + self.collect_data.item_key_generator.gen_id(), symbol, revision + ) + self.tags_data.setdefault(revision, []).append(tag_data) + return tag_data + + def transform_symbol(self, name, revision): + """Transform a symbol according to the project's symbol transforms. + + Transform the symbol with the original name NAME and canonicalized + revision number REVISION. Return the new symbol name or None if + the symbol should be ignored entirely. + + Log the results of the symbol transform if necessary.""" + + old_name = name + # Apply any user-defined symbol transforms to the symbol name: + name = self.cvs_file.project.transform_symbol( + self.cvs_file, name, revision + ) + + if name is None: + # Ignore symbol: + self.pdc.log_symbol_transform(old_name, None) + Log().verbose( + " symbol '%s'=%s ignored in %s" + % (old_name, revision, self.cvs_file.filename,) + ) + else: + if name != old_name: + self.pdc.log_symbol_transform(old_name, name) + Log().verbose( + " symbol '%s'=%s transformed to '%s' in %s" + % (old_name, revision, name, self.cvs_file.filename,) + ) + + return name + + def define_symbol(self, name, revision): + """Record a symbol definition for later processing.""" + + # Canonicalize the revision number: + revision = _branch_revision_re.sub(r'\1\2', revision) + + # Apply any user-defined symbol transforms to the symbol name: + name = self.transform_symbol(name, revision) + + if name is not None: + # Verify that the revision number is valid: + if 
_valid_revision_re.match(revision): + # The revision number is valid; record it for later processing: + self._symbol_defs.append( (name, revision) ) + else: + Log().warn( + 'In %r:\n' + ' branch %r references invalid revision %s\n' + ' and will be ignored.' + % (self.cvs_file.filename, name, revision,) + ) + + def _eliminate_trivial_duplicate_defs(self, symbol_defs): + """Iterate through SYMBOL_DEFS, Removing identical duplicate definitions. + + Duplicate definitions of symbol names have been seen in the wild, + and they can also happen when --symbol-transform is used. If a + symbol is defined to the same revision number repeatedly, then + ignore all but the last definition.""" + + # Make a copy, since we have to iterate through the definitions + # twice: + symbol_defs = list(symbol_defs) + + # A map { (name, revision) : [index,...] } of the indexes where + # symbol definitions name=revision were found: + known_definitions = {} + for (i, symbol_def) in enumerate(symbol_defs): + known_definitions.setdefault(symbol_def, []).append(i) + + # A set of the indexes of entries that have to be removed from + # symbol_defs: + dup_indexes = set() + for ((name, revision), indexes) in known_definitions.iteritems(): + if len(indexes) > 1: + Log().verbose( + "in %r:\n" + " symbol %s:%s defined multiple times; ignoring duplicates\n" + % (self.cvs_file.filename, name, revision,) + ) + dup_indexes.update(indexes[:-1]) + + for (i, symbol_def) in enumerate(symbol_defs): + if i not in dup_indexes: + yield symbol_def + + def _process_duplicate_defs(self, symbol_defs): + """Iterate through SYMBOL_DEFS, processing duplicate names. + + Duplicate definitions of symbol names have been seen in the wild, + and they can also happen when --symbol-transform is used. If a + symbol is defined multiple times, then it is a fatal error. 
This + method should be called after _eliminate_trivial_duplicate_defs().""" + + # Make a copy, since we have to access multiple times: + symbol_defs = list(symbol_defs) + + # A map {name : [index,...]} mapping the names of symbols to a + # list of their definitions' indexes in symbol_defs: + known_symbols = {} + for (i, (name, revision)) in enumerate(symbol_defs): + known_symbols.setdefault(name, []).append(i) + + known_symbols = known_symbols.items() + known_symbols.sort() + dup_indexes = set() + for (name, indexes) in known_symbols: + if len(indexes) > 1: + # This symbol was defined multiple times. + self.collect_data.record_fatal_error( + "Multiple definitions of the symbol '%s' in '%s': %s" % ( + name, self.cvs_file.filename, + ' '.join([symbol_defs[i][1] for i in indexes]), + ) + ) + # Ignore all but the last definition for now, to allow the + # conversion to proceed: + dup_indexes.update(indexes[:-1]) + + for (i, symbol_def) in enumerate(symbol_defs): + if i not in dup_indexes: + yield symbol_def + + def _process_symbol(self, name, revision): + """Process a symbol called NAME, which is associated with REVISON. + + REVISION is a canonical revision number with zeros removed, for + example: '1.7', '1.7.2', or '1.1.1' or '1.1.1.1'. NAME is a + transformed branch or tag name.""" + + # Add symbol to our records: + if is_branch_revision_number(revision): + self._add_branch(name, revision) + else: + self._add_tag(name, revision) + + def process_symbols(self): + """Process the symbol definitions from SELF._symbol_defs.""" + + symbol_defs = self._symbol_defs + del self._symbol_defs + + symbol_defs = self._eliminate_trivial_duplicate_defs(symbol_defs) + symbol_defs = self._process_duplicate_defs(symbol_defs) + + for (name, revision) in symbol_defs: + self._defined_symbols.add(name) + self._process_symbol(name, revision) + + @staticmethod + def rev_to_branch_number(revision): + """Return the branch_number of the branch on which REVISION lies. 
+ + REVISION is a branch revision number with an even number of + components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2'). + The return value is the branch number (for example, '1.7.2'). + Return none iff REVISION is a trunk revision such as '1.2'.""" + + if is_trunk_revision(revision): + return None + return revision[:revision.rindex(".")] + + def rev_to_branch_data(self, revision): + """Return the branch_data of the branch on which REVISION lies. + + REVISION must be a branch revision number with an even number of + components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2'). + Raise KeyError iff REVISION is unknown.""" + + assert not is_trunk_revision(revision) + + return self.branches_data[self.rev_to_branch_number(revision)] + + def rev_to_lod(self, revision): + """Return the line of development on which REVISION lies. + + REVISION must be a revision number with an even number of + components. Raise KeyError iff REVISION is unknown.""" + + if is_trunk_revision(revision): + return self.pdc.trunk + else: + return self.rev_to_branch_data(revision).symbol + + +class _FileDataCollector(cvs2svn_rcsparse.Sink): + """Class responsible for collecting RCS data for a particular file. + + Any collected data that need to be remembered are stored into the + referenced CollectData instance.""" + + def __init__(self, pdc, cvs_file): + """Create an object that is prepared to receive data for CVS_FILE. + CVS_FILE is a CVSFile instance. 
COLLECT_DATA is used to store the + information collected about the file.""" + + self.pdc = pdc + self.cvs_file = cvs_file + + self.collect_data = self.pdc.collect_data + self.project = self.cvs_file.project + + # A place to store information about the symbols in this file: + self.sdc = _SymbolDataCollector(self, self.cvs_file) + + # { revision : _RevisionData instance } + self._rev_data = { } + + # Lists [ (parent, child) ] of revision number pairs indicating + # that revision child depends on revision parent along the main + # line of development. + self._primary_dependencies = [] + + # If set, this is an RCS branch number -- rcsparse calls this the + # "principal branch", but CVS and RCS refer to it as the "default + # branch", so that's what we call it, even though the rcsparse API + # setter method is still 'set_principal_branch'. + self.default_branch = None + + # True iff revision 1.1 of the file appears to have been imported + # (as opposed to added normally). + self._file_imported = False + + def _get_rev_id(self, revision): + if revision is None: + return None + return self._rev_data[revision].cvs_rev_id + + def set_principal_branch(self, branch): + """This is a callback method declared in Sink.""" + + if branch.find('.') == -1: + # This just sets the default branch to trunk. Normally this + # shouldn't occur, but it has been seen in at least one CVS + # repository. Just ignore it. + pass + else: + self.default_branch = branch + + def set_expansion(self, mode): + """This is a callback method declared in Sink.""" + + self.cvs_file.mode = mode + + def define_tag(self, name, revision): + """Remember the symbol name and revision, but don't process them yet. 
+ + This is a callback method declared in Sink.""" + + self.sdc.define_symbol(name, revision) + + def admin_completed(self): + """This is a callback method declared in Sink.""" + + self.sdc.process_symbols() + + def define_revision(self, revision, timestamp, author, state, + branches, next): + """This is a callback method declared in Sink.""" + + for branch in branches: + try: + branch_data = self.sdc.rev_to_branch_data(branch) + except KeyError: + # Normally we learn about the branches from the branch names + # and numbers parsed from the symbolic name header. But this + # must have been an unlabeled branch that slipped through the + # net. Generate a name for it and create a _BranchData record + # for it now. + branch_data = self.sdc._add_unlabeled_branch( + self.sdc.rev_to_branch_number(branch)) + + assert branch_data.child is None + branch_data.child = branch + + if revision in self._rev_data: + # This revision has already been seen. + Log().error('File %r contains duplicate definitions of revision %s.' + % (self.cvs_file.filename, revision,)) + raise RuntimeError + + # Record basic information about the revision: + rev_data = _RevisionData( + self.collect_data.item_key_generator.gen_id(), + revision, int(timestamp), author, state) + self._rev_data[revision] = rev_data + + # When on trunk, the RCS 'next' revision number points to what + # humans might consider to be the 'previous' revision number. For + # example, 1.3's RCS 'next' is 1.2. + # + # However, on a branch, the RCS 'next' revision number really does + # point to what humans would consider to be the 'next' revision + # number. For example, 1.1.2.1's RCS 'next' would be 1.1.2.2. + # + # In other words, in RCS, 'next' always means "where to find the next + # deltatext that you need this revision to retrieve. + # + # That said, we don't *want* RCS's behavior here, so we determine + # whether we're on trunk or a branch and set the dependencies + # accordingly. 
+ if next: + if is_trunk_revision(revision): + self._primary_dependencies.append( (next, revision,) ) + else: + self._primary_dependencies.append( (revision, next,) ) + + def _resolve_primary_dependencies(self): + """Resolve the dependencies listed in self._primary_dependencies.""" + + for (parent, child,) in self._primary_dependencies: + parent_data = self._rev_data[parent] + assert parent_data.child is None + parent_data.child = child + + child_data = self._rev_data[child] + assert child_data.parent is None + child_data.parent = parent + + def _resolve_branch_dependencies(self): + """Resolve dependencies involving branches.""" + + for branch_data in self.sdc.branches_data.values(): + # The branch_data's parent has the branch as a child regardless + # of whether the branch had any subsequent commits: + try: + parent_data = self._rev_data[branch_data.parent] + except KeyError: + Log().warn( + 'In %r:\n' + ' branch %r references non-existing revision %s\n' + ' and will be ignored.' + % (self.cvs_file.filename, branch_data.symbol.name, + branch_data.parent,)) + del self.sdc.branches_data[branch_data.branch_number] + else: + parent_data.branches_data.append(branch_data) + + # If the branch has a child (i.e., something was committed on + # the branch), then we store a reference to the branch_data + # there, define the child's parent to be the branch's parent, + # and list the child in the branch parent's branches_revs_data: + if branch_data.child is not None: + child_data = self._rev_data[branch_data.child] + assert child_data.parent_branch_data is None + child_data.parent_branch_data = branch_data + assert child_data.parent is None + child_data.parent = branch_data.parent + parent_data.branches_revs_data.append(branch_data.child) + + def _sort_branches(self): + """Sort the branches sprouting from each revision in creation order. + + Creation order is taken to be the reverse of the order that they + are listed in the symbols part of the RCS file. 
(If a branch is + created then deleted, a later branch can be assigned the recycled + branch number; therefore branch numbers are not an indication of + creation order.)""" + + for rev_data in self._rev_data.values(): + rev_data.branches_data.sort(lambda a, b: - cmp(a.id, b.id)) + + def _resolve_tag_dependencies(self): + """Resolve dependencies involving tags.""" + + for (rev, tag_data_list) in self.sdc.tags_data.items(): + try: + parent_data = self._rev_data[rev] + except KeyError: + Log().warn( + 'In %r:\n' + ' the following tag(s) reference non-existing revision %s\n' + ' and will be ignored:\n' + ' %s' % ( + self.cvs_file.filename, rev, + ', '.join([repr(tag_data.symbol.name) + for tag_data in tag_data_list]),)) + del self.sdc.tags_data[rev] + else: + for tag_data in tag_data_list: + assert tag_data.rev == rev + # The tag_data's rev has the tag as a child: + parent_data.tags_data.append(tag_data) + + def _determine_operation(self, rev_data): + prev_rev_data = self._rev_data.get(rev_data.parent) + return cvs_revision_type_map[( + rev_data.state != 'dead', + prev_rev_data is not None and prev_rev_data.state != 'dead', + )] + + def _get_cvs_revision(self, rev_data): + """Create and return a CVSRevision for REV_DATA.""" + + branch_ids = [ + branch_data.id + for branch_data in rev_data.branches_data + ] + + branch_commit_ids = [ + self._get_rev_id(rev) + for rev in rev_data.branches_revs_data + ] + + tag_ids = [ + tag_data.id + for tag_data in rev_data.tags_data + ] + + revision_type = self._determine_operation(rev_data) + + return revision_type( + self._get_rev_id(rev_data.rev), self.cvs_file, + rev_data.timestamp, None, + self._get_rev_id(rev_data.parent), + self._get_rev_id(rev_data.child), + rev_data.rev, + True, + self.sdc.rev_to_lod(rev_data.rev), + rev_data.get_first_on_branch_id(), + False, None, None, + tag_ids, branch_ids, branch_commit_ids, + rev_data.revision_recorder_token) + + def _get_cvs_revisions(self): + """Generate the CVSRevisions present in this 
file.""" + + for rev_data in self._rev_data.itervalues(): + yield self._get_cvs_revision(rev_data) + + def _get_cvs_branches(self): + """Generate the CVSBranches present in this file.""" + + for branch_data in self.sdc.branches_data.values(): + yield CVSBranch( + branch_data.id, self.cvs_file, branch_data.symbol, + branch_data.branch_number, + self.sdc.rev_to_lod(branch_data.parent), + self._get_rev_id(branch_data.parent), + self._get_rev_id(branch_data.child), + None, + ) + + def _get_cvs_tags(self): + """Generate the CVSTags present in this file.""" + + for tags_data in self.sdc.tags_data.values(): + for tag_data in tags_data: + yield CVSTag( + tag_data.id, self.cvs_file, tag_data.symbol, + self.sdc.rev_to_lod(tag_data.rev), + self._get_rev_id(tag_data.rev), + None, + ) + + def tree_completed(self): + """The revision tree has been parsed. + + Analyze it for consistency and connect some loose ends. + + This is a callback method declared in Sink.""" + + self._resolve_primary_dependencies() + self._resolve_branch_dependencies() + self._sort_branches() + self._resolve_tag_dependencies() + + # Compute the preliminary CVSFileItems for this file: + cvs_items = [] + cvs_items.extend(self._get_cvs_revisions()) + cvs_items.extend(self._get_cvs_branches()) + cvs_items.extend(self._get_cvs_tags()) + self._cvs_file_items = CVSFileItems( + self.cvs_file, self.pdc.trunk, cvs_items + ) + + self._cvs_file_items.check_link_consistency() + + # Tell the revision recorder about the file dependency tree. + self.collect_data.revision_recorder.start_file(self._cvs_file_items) + + def set_revision_info(self, revision, log, text): + """This is a callback method declared in Sink.""" + + rev_data = self._rev_data[revision] + cvs_rev = self._cvs_file_items[rev_data.cvs_rev_id] + + if cvs_rev.metadata_id is not None: + # Users have reported problems with repositories in which the + # deltatext block for revision 1.1 appears twice. 
It is not + # known whether this results from a CVS/RCS bug, or from botched + # hand-editing of the repository. In any case, empirically, cvs + # and rcs both use the first version when checking out data, so + # that's what we will do. (For the record: "cvs log" fails on + # such a file; "rlog" prints the log message from the first + # block and ignores the second one.) + Log().warn( + "%s: in '%s':\n" + " Deltatext block for revision %s appeared twice;\n" + " ignoring the second occurrence.\n" + % (warning_prefix, self.cvs_file.filename, revision,) + ) + return + + if is_trunk_revision(revision): + branch_name = None + else: + branch_name = self.sdc.rev_to_branch_data(revision).symbol.name + + cvs_rev.metadata_id = self.collect_data.metadata_logger.store( + self.project, branch_name, rev_data.author, log + ) + cvs_rev.deltatext_exists = bool(text) + + # If this is revision 1.1, determine whether the file appears to + # have been created via 'cvs add' instead of 'cvs import'. The + # test is that the log message CVS uses for 1.1 in imports is + # "Initial revision\n" with no period. (This fact helps determine + # whether this file might have had a default branch in the past.) + if revision == '1.1': + self._file_imported = (log == 'Initial revision\n') + + cvs_rev.revision_recorder_token = \ + self.collect_data.revision_recorder.record_text(cvs_rev, log, text) + + def parse_completed(self): + """Finish the processing of this file. + + This is a callback method declared in Sink.""" + + # Make sure that there was an info section for each revision: + for cvs_item in self._cvs_file_items.values(): + if isinstance(cvs_item, CVSRevision) and cvs_item.metadata_id is None: + self.collect_data.record_fatal_error( + '%r has no deltatext section for revision %s' + % (self.cvs_file.filename, cvs_item.rev,) + ) + + def _process_ntdbrs(self): + """Fix up any non-trunk default branch revisions (if present). 
+ + If a non-trunk default branch is determined to have existed, yield + the _RevisionData.ids for all revisions that were once non-trunk + default revisions, in dependency order. + + There are two cases to handle: + + One case is simple. The RCS file lists a default branch + explicitly in its header, such as '1.1.1'. In this case, we know + that every revision on the vendor branch is to be treated as head + of trunk at that point in time. + + But there's also a degenerate case. The RCS file does not + currently have a default branch, yet we can deduce that for some + period in the past it probably *did* have one. For example, the + file has vendor revisions 1.1.1.1 -> 1.1.1.96, all of which are + dated before 1.2, and then it has 1.1.1.97 -> 1.1.1.100 dated + after 1.2. In this case, we should record 1.1.1.96 as the last + vendor revision to have been the head of the default branch. + + If any non-trunk default branch revisions are found: + + - Set their ntdbr members to True. + + - Connect the last one with revision 1.2. + + - Remove revision 1.1 if it is not needed. 
+ + """ + + try: + if self.default_branch: + vendor_cvs_branch_id = self.sdc.branches_data[self.default_branch].id + vendor_lod_items = self._cvs_file_items.get_lod_items( + self._cvs_file_items[vendor_cvs_branch_id] + ) + if not self._cvs_file_items.process_live_ntdb(vendor_lod_items): + return + elif self._file_imported: + vendor_branch_data = self.sdc.branches_data.get('1.1.1') + if vendor_branch_data is None: + return + else: + vendor_lod_items = self._cvs_file_items.get_lod_items( + self._cvs_file_items[vendor_branch_data.id] + ) + if not self._cvs_file_items.process_historical_ntdb( + vendor_lod_items + ): + return + else: + return + except VendorBranchError, e: + self.collect_data.record_fatal_error(str(e)) + return + + if self._file_imported: + self._cvs_file_items.imported_remove_1_1(vendor_lod_items) + + self._cvs_file_items.check_link_consistency() + + def get_cvs_file_items(self): + """Finish up and return a CVSFileItems instance for this file. + + This method must only be called once.""" + + self._process_ntdbrs() + + # Break a circular reference loop, allowing the memory for self + # and sdc to be freed. + del self.sdc + + return self._cvs_file_items + + +class _ProjectDataCollector: + def __init__(self, collect_data, project): + self.collect_data = collect_data + self.project = project + self.num_files = 0 + + # The Trunk LineOfDevelopment object for this project: + self.trunk = Trunk( + self.collect_data.symbol_key_generator.gen_id(), self.project + ) + self.project.trunk_id = self.trunk.id + + # This causes a record for self.trunk to spring into existence: + self.collect_data.symbol_stats[self.trunk] + + # A map { name -> Symbol } for all known symbols in this project. + # The symbols listed here are undifferentiated into Branches and + # Tags because the same name might appear as a branch in one file + # and a tag in another. 
+ self.symbols = {} + + # A map { (old_name, new_name) : count } indicating how many files + # were affected by each each symbol name transformation: + self.symbol_transform_counts = {} + + def get_symbol(self, name): + """Return the Symbol object for the symbol named NAME in this project. + + If such a symbol does not yet exist, allocate a new symbol_id, + create a Symbol instance, store it in self.symbols, and return it.""" + + symbol = self.symbols.get(name) + if symbol is None: + symbol = Symbol( + self.collect_data.symbol_key_generator.gen_id(), + self.project, name) + self.symbols[name] = symbol + return symbol + + def log_symbol_transform(self, old_name, new_name): + """Record that OLD_NAME was transformed to NEW_NAME in one file. + + This information is used to generated a statistical summary of + symbol transforms.""" + + try: + self.symbol_transform_counts[old_name, new_name] += 1 + except KeyError: + self.symbol_transform_counts[old_name, new_name] = 1 + + def summarize_symbol_transforms(self): + if self.symbol_transform_counts and Log().is_on(Log.NORMAL): + log = Log() + log.normal('Summary of symbol transforms:') + transforms = self.symbol_transform_counts.items() + transforms.sort() + for ((old_name, new_name), count) in transforms: + if new_name is None: + log.normal(' "%s" ignored in %d files' % (old_name, count,)) + else: + log.normal( + ' "%s" transformed to "%s" in %d files' + % (old_name, new_name, count,) + ) + + def _process_cvs_file_items(self, cvs_file_items): + """Process the CVSFileItems from one CVSFile.""" + + # Remove CVSRevisionDeletes that are not needed: + cvs_file_items.remove_unneeded_deletes(self.collect_data.metadata_db) + + # Remove initial branch deletes that are not needed: + cvs_file_items.remove_initial_branch_deletes( + self.collect_data.metadata_db + ) + + # If this is a --trunk-only conversion, discard all branches and + # tags, then draft any non-trunk default branch revisions to + # trunk: + if Ctx().trunk_only: + 
cvs_file_items.exclude_non_trunk() + + self.collect_data.revision_recorder.finish_file(cvs_file_items) + self.collect_data.add_cvs_file_items(cvs_file_items) + self.collect_data.symbol_stats.register(cvs_file_items) + + def process_file(self, cvs_file): + Log().normal(cvs_file.filename) + fdc = _FileDataCollector(self, cvs_file) + try: + cvs2svn_rcsparse.parse(open(cvs_file.filename, 'rb'), fdc) + except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError): + self.collect_data.record_fatal_error( + "%r is not a valid ,v file" % (cvs_file.filename,) + ) + # Abort the processing of this file, but let the pass continue + # with other files: + return + except: + Log().warn("Exception occurred while parsing %s" % cvs_file.filename) + raise + else: + self.num_files += 1 + + cvs_file_items = fdc.get_cvs_file_items() + + del fdc + + self._process_cvs_file_items(cvs_file_items) + + +class CollectData: + """Repository for data collected by parsing the CVS repository files. + + This class manages the databases into which information collected + from the CVS repository is stored. 
The data are stored into this + class by _FileDataCollector instances, one of which is created for + each file to be parsed.""" + + def __init__(self, revision_recorder, stats_keeper): + self.revision_recorder = revision_recorder + self._cvs_item_store = NewCVSItemStore( + artifact_manager.get_temp_file(config.CVS_ITEMS_STORE)) + self.metadata_db = MetadataDatabase( + artifact_manager.get_temp_file(config.METADATA_STORE), + artifact_manager.get_temp_file(config.METADATA_INDEX_TABLE), + DB_OPEN_NEW, + ) + self.metadata_logger = MetadataLogger(self.metadata_db) + self.fatal_errors = [] + self.num_files = 0 + self.symbol_stats = SymbolStatisticsCollector() + self.stats_keeper = stats_keeper + + # Key generator for CVSFiles: + self.file_key_generator = KeyGenerator() + + # Key generator for CVSItems: + self.item_key_generator = KeyGenerator() + + # Key generator for Symbols: + self.symbol_key_generator = KeyGenerator() + + self.revision_recorder.start() + + def record_fatal_error(self, err): + """Record that fatal error ERR was found. + + ERR is a string (without trailing newline) describing the error. + Output the error to stderr immediately, and record a copy to be + output again in a summary at the end of CollectRevsPass.""" + + err = '%s: %s' % (error_prefix, err,) + Log().error(err + '\n') + self.fatal_errors.append(err) + + def add_cvs_directory(self, cvs_directory): + """Record CVS_DIRECTORY.""" + + Ctx()._cvs_file_db.log_file(cvs_directory) + + def add_cvs_file_items(self, cvs_file_items): + """Record the information from CVS_FILE_ITEMS. 
+ + Store the CVSFile to _cvs_file_db under its persistent id, store + the CVSItems, and record the CVSItems to self.stats_keeper.""" + + Ctx()._cvs_file_db.log_file(cvs_file_items.cvs_file) + self._cvs_item_store.add(cvs_file_items) + + self.stats_keeper.record_cvs_file(cvs_file_items.cvs_file) + for cvs_item in cvs_file_items.values(): + self.stats_keeper.record_cvs_item(cvs_item) + + def _get_cvs_file( + self, parent_directory, basename, file_in_attic, leave_in_attic=False + ): + """Return a CVSFile describing the file with name BASENAME. + + PARENT_DIRECTORY is the CVSDirectory instance describing the + directory that physically holds this file in the filesystem. + BASENAME must be the base name of a *,v file within + PARENT_DIRECTORY. + + FILE_IN_ATTIC is a boolean telling whether the specified file is + in an Attic subdirectory. If FILE_IN_ATTIC is True, then: + + - If LEAVE_IN_ATTIC is True, then leave the 'Attic' component in + the filename. + + - Otherwise, raise FileInAndOutOfAtticException if a file with the + same filename appears outside of Attic. + + The CVSFile is assigned a new unique id. All of the CVSFile + information is filled in except mode (which can only be determined + by parsing the file). 
+ + Raise FatalError if the resulting filename would not be legal in + SVN.""" + + filename = os.path.join(parent_directory.filename, basename) + try: + verify_svn_filename_legal(basename[:-2]) + except IllegalSVNPathError, e: + raise FatalError( + 'File %r would result in an illegal SVN filename: %s' + % (filename, e,) + ) + + if file_in_attic and not leave_in_attic: + in_attic = True + logical_parent_directory = parent_directory.parent_directory + + # If this file also exists outside of the attic, it's a fatal + # error: + non_attic_filename = os.path.join( + logical_parent_directory.filename, basename, + ) + if os.path.exists(non_attic_filename): + raise FileInAndOutOfAtticException(non_attic_filename, filename) + else: + in_attic = False + logical_parent_directory = parent_directory + + file_stat = os.stat(filename) + + # The size of the file in bytes: + file_size = file_stat[stat.ST_SIZE] + + # Whether or not the executable bit is set: + file_executable = bool(file_stat[0] & stat.S_IXUSR) + + # mode is not known, so we temporarily set it to None. + return CVSFile( + self.file_key_generator.gen_id(), + parent_directory.project, logical_parent_directory, basename[:-2], + in_attic, file_executable, file_size, None + ) + + def _get_attic_file(self, parent_directory, basename): + """Return a CVSFile object for the Attic file at BASENAME. + + PARENT_DIRECTORY is the CVSDirectory that physically contains the + file on the filesystem (i.e., the Attic directory). It is not + necessarily the parent_directory of the CVSFile that will be + returned. 
+ + Return CVSFile, whose parent directory is usually + PARENT_DIRECTORY.parent_directory, but might be PARENT_DIRECTORY + iff CVSFile will remain in the Attic directory.""" + + try: + return self._get_cvs_file(parent_directory, basename, True) + except FileInAndOutOfAtticException, e: + if Ctx().retain_conflicting_attic_files: + Log().warn( + "%s: %s;\n" + " storing the latter into 'Attic' subdirectory.\n" + % (warning_prefix, e) + ) + else: + self.record_fatal_error(str(e)) + + # Either way, return a CVSFile object so that the rest of the + # file processing can proceed: + return self._get_cvs_file( + parent_directory, basename, True, leave_in_attic=True + ) + + def _generate_attic_cvs_files(self, cvs_directory): + """Generate CVSFiles for the files in Attic directory CVS_DIRECTORY. + + Also add CVS_DIRECTORY to self if any files are being retained in + that directory.""" + + retained_attic_file = False + + fnames = os.listdir(cvs_directory.filename) + fnames.sort() + for fname in fnames: + pathname = os.path.join(cvs_directory.filename, fname) + if os.path.isdir(pathname): + Log().warn("Directory %s found within Attic; ignoring" % (pathname,)) + elif fname.endswith(',v'): + cvs_file = self._get_attic_file(cvs_directory, fname) + if cvs_file.parent_directory == cvs_directory: + # This file will be retained in the Attic directory. + retained_attic_file = True + yield cvs_file + + if retained_attic_file: + # If any files were retained in the Attic directory, then write + # the Attic directory to CVSFileDatabase: + self.add_cvs_directory(cvs_directory) + + def _get_non_attic_file(self, parent_directory, basename): + """Return a CVSFile object for the non-Attic file at BASENAME.""" + + return self._get_cvs_file(parent_directory, basename, False) + + def _generate_cvs_files(self, cvs_directory): + """Generate the CVSFiles under non-Attic directory CVS_DIRECTORY. + + Process directories recursively, including Attic directories. 
+ Also create and register CVSDirectories as they are found, and + look for conflicts between the filenames that will result from + files, attic files, and subdirectories.""" + + self.add_cvs_directory(cvs_directory) + + # Map {cvs_file.basename : cvs_file.filename} for files directly + # in cvs_directory: + rcsfiles = {} + + attic_dir = None + + # Non-Attic subdirectories of cvs_directory (to be recursed into): + dirs = [] + + fnames = os.listdir(cvs_directory.filename) + fnames.sort() + for fname in fnames: + pathname = os.path.join(cvs_directory.filename, fname) + if os.path.isdir(pathname): + if fname == 'Attic': + attic_dir = fname + else: + dirs.append(fname) + elif fname.endswith(',v'): + cvs_file = self._get_non_attic_file(cvs_directory, fname) + rcsfiles[cvs_file.basename] = cvs_file.filename + yield cvs_file + else: + # Silently ignore other files: + pass + + # Map {cvs_file.basename : cvs_file.filename} for files in an + # Attic directory within cvs_directory: + attic_rcsfiles = {} + + if attic_dir is not None: + attic_directory = CVSDirectory( + self.file_key_generator.gen_id(), + cvs_directory.project, cvs_directory, 'Attic', + ) + + for cvs_file in self._generate_attic_cvs_files(attic_directory): + if cvs_file.parent_directory == cvs_directory: + attic_rcsfiles[cvs_file.basename] = cvs_file.filename + yield cvs_file + + alldirs = dirs + [attic_dir] + else: + alldirs = dirs + + # Check for conflicts between directory names and the filenames + # that will result from the rcs files (both in this directory and + # in attic). (We recurse into the subdirectories nevertheless, to + # try to detect more problems.) + for fname in alldirs: + pathname = os.path.join(cvs_directory.filename, fname) + for rcsfile_list in [rcsfiles, attic_rcsfiles]: + if fname in rcsfile_list: + self.record_fatal_error( + 'Directory name conflicts with filename. 
Please remove or ' + 'rename one\n' + 'of the following:\n' + ' "%s"\n' + ' "%s"' + % (pathname, rcsfile_list[fname],) + ) + + # Now recurse into the other subdirectories: + for fname in dirs: + dirname = os.path.join(cvs_directory.filename, fname) + + # Verify that the directory name does not contain any illegal + # characters: + try: + verify_svn_filename_legal(fname) + except IllegalSVNPathError, e: + raise FatalError( + 'Directory %r would result in an illegal SVN path name: %s' + % (dirname, e,) + ) + + sub_directory = CVSDirectory( + self.file_key_generator.gen_id(), + cvs_directory.project, cvs_directory, fname, + ) + + for cvs_file in self._generate_cvs_files(sub_directory): + yield cvs_file + + def process_project(self, project): + Ctx()._projects[project.id] = project + + root_cvs_directory = CVSDirectory( + self.file_key_generator.gen_id(), project, None, '' + ) + project.root_cvs_directory_id = root_cvs_directory.id + pdc = _ProjectDataCollector(self, project) + + found_rcs_file = False + for cvs_file in self._generate_cvs_files(root_cvs_directory): + pdc.process_file(cvs_file) + found_rcs_file = True + + if not found_rcs_file: + self.record_fatal_error( + 'No RCS files found under %r!\n' + 'Are you absolutely certain you are pointing cvs2svn\n' + 'at a CVS repository?\n' + % (project.project_cvs_repos_path,) + ) + + pdc.summarize_symbol_transforms() + + self.num_files += pdc.num_files + Log().verbose('Processed', self.num_files, 'files') + + def _set_cvs_path_ordinals(self): + cvs_files = list(Ctx()._cvs_file_db.itervalues()) + cvs_files.sort(CVSPath.slow_compare) + for (i, cvs_file) in enumerate(cvs_files): + cvs_file.ordinal = i + + def close(self): + """Close the data structures associated with this instance. + + Return a list of fatal errors encountered while processing input. 
+ Each list entry is a string describing one fatal error.""" + + self.revision_recorder.finish() + self.symbol_stats.purge_ghost_symbols() + self.symbol_stats.close() + self.symbol_stats = None + self.metadata_logger = None + self.metadata_db.close() + self.metadata_db = None + self._cvs_item_store.close() + self._cvs_item_store = None + self._set_cvs_path_ordinals() + self.revision_recorder = None + retval = self.fatal_errors + self.fatal_errors = None + return retval + + |