# pgn-compare.py

#! /usr/bin/python
# -*- coding: utf-8 -*-
#
# iso-8859-15, latin-1 or utf-8, etc

# ----------------------------------------------------------------------------------------------------------------------------------------
# (C) Zanzibar - no warranty of any kind provided, nor guarantee of correctness or suitability.
#                           Code is provided with only the restriction that no transfer of copyright is expressed.
# ----------------------------------------------------------------------------------------------------------------------------------------

import os, sys, time, math, re, types
import urllib, urllib2
import cPickle, string

from urllib import urlopen

# ----- Globals ----- #

# Spellings of game results.  (Not referenced in the visible part of this
# file.)
canonical_results = [
   '1-0', '0-1', '1/2-1/2', '*', '=', '1', '0', '½', '½-½',
]

# Input PGN file names, keyed by source label ("CG", "FIDE").
fn = dict()

# FIDE player name -> CG player name.
FIDE_remap = dict()

# ---------------------------------------------------------------
# Hand-edit/uncomment this section to avoid command line input,
# and to control name mapping explicitly.
# ---------------------------------------------------------------
#
#
# dirname =  "H:/Chess/Tournaments/2000/2014/Dubai WRB/"
# 
# fn = {
#    "CG"   : dirname + "CG-Dubai-Blitz-2014.pgn",
#    "FIDE" : dirname + "WRB_Blitz_Full.pgn",
#    "TWIC" : dirname + "TWIC-Dubai-Blitz-2014.pgn"
# }
# 
# # Default, will call name_match() to align names.
# 
# FIDE_remap = {
# 
#    "Al-Sayed, Mohammed"          : "Mohamad Naser Al Sayed",
#    "AlHuwar, Jasem"              : "A R Saleh Jasim",
#    "Vahidov, Tair"               : "Tahir Vakhidov",
#    "Van Wely, Loek"              : "Loek van Wely",
#    "Iturrizaga Bonelli, Eduardo" : "E Iturrizaga"
# }
# ---------------------------------------------------------------


# ------------
# class PGN:
# ------------

class PGN:
   '''
   One game: its PGN tags plus its movetext.

   Every entry of `tags` becomes an instance attribute of the same name
   (White, Black, Result, Date, ...); the movetext is stored in `pgn`.
   '''

   # Tags that __str__() writes first, in this order.  All remaining tags
   # follow in the order they are stored on the instance.
   _TAG_ORDER = [ "Event", "Site", "Date", "Round",
                  "White", "Black", "Result",
                  "WhiteTitle",  "BlackTitle",
                  "WhiteElo",    "BlackElo",
                  "WhiteFideId", "BlackFideId",
                  "ECO", "Opening", "EventDate" ]

   # -------------------------------------
   # def __init__( self, tags, pgn ):
   # -------------------------------------

   def __init__( self, tags, pgn ):
      '''
      tags - mapping of PGN tag name -> value.
      pgn  - the movetext of the game.
      '''

      for tag in tags:
         self.__dict__[tag] = tags[tag]
      self.pgn = pgn


   # -------------------------
   # def __str__( self ):
   # -------------------------

   def __str__( self ):
      '''
      Return the game as PGN text: tag pairs, blank line, movetext, result.

      A Result of the form "A (B)" (mismatching results within one game) is
      written as "A" in the Result tag and as "B" after the movetext.
      Raises AttributeError if the game has no Result.
      '''

      result = self.Result

      if '(' in result and result[-1] == ')':
         ik = result.rfind( '(' )
         move_result = result[ik+1:-1]
         tag_result  = result[:ik].strip()
      else:
         move_result = tag_result = result

      # Work on a copy so that the instance itself is never modified, not
      # even temporarily, while the text is being built.
      values = dict( self.__dict__ )
      values[ "Result" ] = tag_result

      rc = "\n"
      for tag in self._TAG_ORDER:
         # Best effort: a tag that is absent (or cannot be formatted) is
         # simply left out.
         try:
            rc += '[' + tag + ' "%s"]\n' % values[tag]
         except Exception:
            pass

      for tag in values:
         if tag in self._TAG_ORDER or tag == 'pgn': continue
         rc += '[' + tag + ' "%s"]\n' % values[tag]

      rc += "\n" + self.pgn + " " + move_result + "\n\n"

      return rc


   # --------------------------
   # def __repr__( self ):
   # --------------------------

   def __repr__( self ):
      '''
      Return a one-line summary: "White -- Black (Rn) result (ECO) date".

      White, Black and Date are required (AttributeError if missing); Round,
      Result and ECO are added only when present and usable as strings.
      '''

      rc = self.White + " -- " + self.Black

      optional = ( ( " (R", "Round",  ")" ),
                   ( " ",   "Result", ""  ),
                   ( " (",  "ECO",    ")" ) )

      for prefix, tag, suffix in optional:
         try:
            rc += prefix + getattr( self, tag ) + suffix
         except Exception:
            pass

      rc += " " + self.Date

      return rc


# -------------------------------------------------------------
# Find duplicate games within each version of the tournament.
# -------------------------------------------------------------

def _find_dups( games ):
   '''
   Return a list of ( earlier, later ) pairs of games from `games` that were
   played between the same two players, with the same or reversed colours.
   '''

   pairs = []

   for i, a in enumerate( games ):
      for b in games[i+1:]:

         if a.White == b.White and a.Black == b.Black:
            pairs.append( (a,b) )

         if a.White == b.Black and a.Black == b.White:
            pairs.append( (a,b) )

   return pairs


class Dup:
   '''
   Duplicate pairings found within each game list.

   cg   - pairs of games from CG   with the same two players.
   fide - pairs of games from FIDE with the same two players.
   '''

   def __init__( self, CG, FIDE ):

      # Each list is scanned on its own; pairs are never formed across the
      # two sources.
      self.cg   = _find_dups( CG )
      self.fide = _find_dups( FIDE )
      
      
# -----------------
# class Mismatch:
# -----------------

class Name:
   '''Two name lists, one per source: `cg` and `fide` (both start empty).'''

   def __init__( self ):
      self.cg, self.fide = [], []

class Mismatch:
   '''Three lists that start empty: `pgn`, `result` and `colors`.'''

   def __init__( self ):
      for attr in ( "pgn", "result", "colors" ):
         setattr( self, attr, [] )

class Missing:
   '''
   Lists that start empty: `cg` and `fide`, plus a Name() instance in
   `name` whose own `cg` and `fide` lists are empty as well.
   '''

   def __init__( self ):
      self.cg, self.fide = [], []

      names = Name()
      names.cg, names.fide = [], []
      self.name = names

      
# ----------------
# def display():
# ----------------

def _move_diff_start( a, b ):
   '''
   Return the index in `a` where the first move that differs from `b` begins.

   The first differing character is located, then the index is moved back
   to the preceding '.' and from there to the start of that token (the move
   number).  If the two texts agree over their common length, that length
   is returned.
   '''

   kmax = min( len(a), len(b) )

   k = 0
   while k < kmax and a[k] == b[k]: k += 1
   if k == kmax: return kmax

   while k and a[k] != '.': k -= 1
   while k and a[k] != ' ': k -= 1

   # Step over the separating space; when the token starts the text there
   # is no space to step over.
   if a[k] == ' ': k += 1

   return k


def _excerpt( text, start ):
   '''Return up to eight space-separated tokens of `text` from `start` on.'''

   return " ".join( text[start:].split( " " )[:8] ).strip()


def _show_list( empty_msg, header, items ):
   '''Print `empty_msg` if `items` is empty, else `header` and each item.'''

   if not items:
      print( empty_msg )
   else:
      print( header )
      for item in items:
         print( "   %s" % ( item, ) )


def _show_duplicates( label, pairs, games ):
   '''
   Print the duplicate `pairs` found within one source.

   label - source name used in the heading ("CG" or "FIDE").
   games - the full game list of that source; supplies the game numbers.
   '''

   print( label + " duplicates (within PGN/tournament):" )
   for pair in pairs:
      print( "" )
      for g in pair:
         print( "  Game #%2.2d> %s" % ( games.index(g)+1, repr(g) ) )

      # Identical movetext: nothing further to show.
      if pair[0].pgn == pair[1].pgn: continue

      print( "" )
      k = _move_diff_start( pair[0].pgn, pair[1].pgn )
      for g in pair:
         print( "       #%2.2d> %s" % ( games.index(g)+1, _excerpt( g.pgn, k ) ) )
   print( "" )


def display():
   '''
   Print the comparison report.

   Reads the module-level `missing`, `mismatch`, `dup`, `CG`, `FIDE` and
   `FIDE_remap` and prints, in order: missing names, missing games, result
   mismatches, movetext mismatches, duplicates within each source, and the
   non-trivial name mapping.  Takes no arguments and returns nothing.

   Each print() call below is given exactly one string, so it produces the
   same output as a print statement and as a print function.
   '''

   # Names present in one source only.

   print( "" )
   _show_list( "CG missing names: None.",
               "CG missing names (not found in FIDE):\n",
               missing.name.cg )
   print( "" )
   _show_list( "FIDE missing names: None",
               "FIDE missing names (not found in CG):\n",
               missing.name.fide )
   print( "" )

   # Games present in one source only.

   print( "" )
   _show_list( "CG missing games: None",
               "CG missing games (not found in FIDE):\n",
               [ repr(g) for g in missing.cg ] )
   print( "" )

   _show_list( "FIDE missing games: None",
               "FIDE missing games (not found in CG):\n",
               [ repr(g) for g in missing.fide ] )
   print( "" )

   # Games whose results differ; a draw is shown as " = ".

   print( "" )
   if not mismatch.result:
      print( "Result mismatches: None\n" )
   else:
      print( "Result mismatches (CG result, then FIDE):\n" )
      for pair in mismatch.result:
         shown = []
         for k in range(2):
            r = pair[k].Result
            if r == "1/2-1/2": r = " = "
            shown.append( r )
         print( "%3s  %3s  %s" % ( shown[0], shown[1], repr( pair[1] ) ) )
   print( "" )

   # Now print out mismatched movelists.

   print( "" )
   if not mismatch.pgn:
      print( "PGN mismatches: None\n" )
   else:
      print( "" )
      print( "PGN mismatches (CG then FIDE):\n" )
      for cg_game, fide_game in mismatch.pgn:
         # The space before each newline is deliberate: it is what the
         # comma-separated print statement used to emit.
         print( "   %s \n   %s \n" % ( repr( cg_game ), repr( fide_game ) ) )

         k = _move_diff_start( cg_game.pgn, fide_game.pgn )

         print( "      CG> " + _excerpt( cg_game.pgn,   k ) )
         print( "    FIDE> " + _excerpt( fide_game.pgn, k ) )
         print( "" )
   print( "" )

   # Next, find duplicates within the same PGN (tournament).
   # (We are assuming no double round robins at the moment).

   print( "" )
   if dup.cg:
      _show_duplicates( "CG", dup.cg, CG )
   print( "" )

   if dup.fide:
      _show_duplicates( "FIDE", dup.fide, FIDE )

   # Now print out the name mapping for non-trivial matches, i.e. those
   # where the CG name is not just "Surname, Given" turned around.

   print( "" )
   print( "Non-trivial FIDE -> CG name mapping.\n" )

   for fide_name in FIDE_remap:
      parts = fide_name.split( ',' )
      parts.reverse()
      if " ".join( parts ).strip() != FIDE_remap[ fide_name ]:
         print( "  %-35s  %s" % ( fide_name, FIDE_remap[ fide_name ] ) )
   print( "" )
   
      
# ---------------------
# def cg_lookup( s ):
# ---------------------

def cg_lookup( s ):
   '''
   cg_lookup( s ) - look up a "Surname, Given" name in the global `cg`.

   The comma-separated parts of `s` are reversed and joined with spaces,
   and every entry of `cg` whose last element equals that name is printed.
   Returns ( name, first element of the entry ) for a single hit, otherwise
   ( name, tuple of the first elements of all hits ).

   NOTE(review): `cg` is not defined in the visible part of this file -
   presumably a sequence of records ending in a player name; confirm.
   '''

   parts = s.split( "," )
   parts.reverse()
   s = " ".join( parts ).strip()

   print( "\n" + s )

   hits = [ entry for entry in cg if s == entry[-1] ]
   for entry in hits:
      print( entry )
   print( "" )

   if len( hits ) == 1:
      return ( s, hits[0][0] )

   return ( s, tuple( [ entry[0] for entry in hits ] ) )

# ---------------------
# def pgn_load( fn ):
# ---------------------

def pgn_load( fn ):
   '''
   pgn_load( fn ) - load given PGN file, return array of py PGN.
   '''
   
   i = 0    # Index of the current line within L.
   G = []
   M = []

   # Read the whole file into memory as a list of lines.
   f = open( fn ); L = f.readlines(); f.close()

   #
   # Find first game.
   # (Advances i to the first line that starts with '['; a file with no
   # such line ends in an IndexError.)

   while L[i][0] != '[': i += 1

   n = 0
   # NOTE(review): this copy of the file is cut off in the middle of the next
   # statement - the rest of pgn_load() is missing, and the code that follows
   # belongs to a different function.  Restore it from the original script.
   while i 

      # NOTE(review): the "def" line and the loop header that own this code
      # are missing from this copy of the file.  The call near the end of
      # the file suggests this is the body of name_match( remap ), looping
      # over the FIDE names with i - confirm against the original script.
      #
      # Each stage below accepts a candidate only when exactly one CG name
      # qualifies; otherwise the next, looser stage is tried.

      # A mapping supplied by the caller wins over any automatic match.
      if i in remap:
         FIDE_remap[ i ] = remap[i]
         continue
      
      #
      # Look for simple exact matches first.
      
      M = [ k for k in name.cg if i == k ]
      if len(M) == 1:
         FIDE_remap[ i ] = M[0]
         continue
      
      # Look for uncollated exact match.
      # ("Surname, Given" is turned into "Given Surname"; surname keeps the
      # text before the first comma for the last stage.)
      
      L = i.split(',');  surname = L[0]
      L.reverse()
      
      s = " ".join( L ).strip()
      
      M = [ k for k in name.cg if s == k ]
      if len(M) == 1:
         FIDE_remap[ i ] = M[0]; continue
         
      # Look for Chinese/Vietnamese/Korean/Japanese/etc ordering.
      # (Same word order as the FIDE name, with the commas removed.)
      
      s = i.replace(',','')
      
      M = [ k for k in name.cg if s == k ]
      if len(M) == 1:
         FIDE_remap[ i ] = M[0]; continue
      
      # Now look for names where all words match in a CG name.
      # (Each word only has to occur as a substring of the CG name.)
      
      M = [ k for k in name.cg if all( [ j in k for j in s.split() ] ) ]
      if len(M) == 1:
         FIDE_remap[ i ] = M[0]; continue
         
      # Next, look for surname matching.
      # (Again a substring test.)
      
      M = [ k for k in name.cg if surname in k ]
      if len(M) == 1:
         FIDE_remap[ i ] = M[0]; continue
         
   #
   # Now clean up unmapped names.
   # (FIDE names without a mapping, and CG names that no FIDE name maps to.)
   
   FIDE_unmapped = [ i for i in name.fide if i not in FIDE_remap ]
   CG_unmapped = [ i for i in name.cg if i not in FIDE_remap.values() ]
   
   # for i in FIDE_unmapped:
   
   # Ask the user to pick a CG name for each FIDE name that is still unmapped.
   if FIDE_unmapped:
      print "\nNot all names matched, continuing manually:\n"
      for i in FIDE_unmapped:
         print "Unmapped FIDE name '%s'" % i
         if not CG_unmapped: continue
         print 
         for ik in xrange( len( CG_unmapped ) ):
            print " %d)" % ik, CG_unmapped[ ik ]
         try:
            # Empty input picks entry 0, "q" stops asking, anything else is
            # read as the number of the entry.
            s = raw_input( "\nEnter choice: " ).strip()
            if not s:
               ik = 0
            elif s == "q":
               break
            else:
               ik = int( s ) 
            FIDE_remap[ i ] = CG_unmapped[ik]
            # A CG name that has been picked is not offered again.
            del( CG_unmapped[ik] )
         except:
            # Any error (e.g. a non-numeric or out-of-range choice) leaves
            # this name unmapped and moves on to the next one.
            print 
            pass
   print
   
   # Recompute both lists so that they reflect the manual choices.
   FIDE_unmapped = [ i for i in name.fide if i not in FIDE_remap ]
   CG_unmapped   = [ i for i in name.cg   if i not in FIDE_remap.values() ]
   
   return ( FIDE_remap, FIDE_unmapped, CG_unmapped )

   
# -----
# Main 
# -----

# Skip the leading option arguments.  "-f" discards a hand-edited file table
# and "-r" a hand-edited name mapping (see the Globals section above).
# Running without any arguments is fine as long as fn was filled in by hand.

i = 1
while i < len( sys.argv ) and sys.argv[i].startswith( '-' ):
   option = sys.argv[i][1:2]
   if option == 'f': fn = {}
   if option == 'r': FIDE_remap = {}
   i += 1

# Without a file table the two PGN file names must come from the command line.

if not fn:
   if len( sys.argv ) - i < 2:
      sys.stderr.write( "usage: %s [-f] [-r] CG.pgn FIDE.pgn\n" % sys.argv[0] )
      sys.exit( 2 )
   fn[ "CG"   ] = sys.argv[i]; i += 1
   fn[ "FIDE" ] = sys.argv[i]; i += 1

#Load the input PGN files.

CG   = pgn_load( fn[ "CG"   ] )
FIDE = pgn_load( fn[ "FIDE" ] )

# Player name data structure.

name = Name()

#
# Missing and mismatched games data structures.

missing  = Missing()
mismatch = Mismatch()

# Sorted, duplicate-free list of the player names of each source.

name.cg   = sorted( set( [ g.White for g in CG   ] + [ g.Black for g in CG   ] ) )
name.fide = sorted( set( [ g.White for g in FIDE ] + [ g.Black for g in FIDE ] ) )

FIDE_org_names = list( name.fide )

#
# Find the name mapping.

# Call the remapping function (which might have been preempted, see above).

( FIDE_remap, FIDE_unmapped, CG_unmapped ) = name_match( FIDE_remap )

# From here on FIDE names are expressed as CG names wherever a mapping exists.

name.fide[:] = [ FIDE_remap.get( n, n ) for n in name.fide ]

#
# Now determine if any names are unmatched.

cg_names   = set( name.cg )
fide_names = set( name.fide )

missing.name.cg   = [ n for n in name.cg   if n not in fide_names ]
missing.name.fide = [ n for n in name.fide if n not in cg_names   ]

#
# Normalize the pgn (collapse every run of whitespace to a single space).

for g in CG:
   g.pgn = " ".join( g.pgn.split() )

for f in FIDE:
   f.pgn = " ".join( f.pgn.split() )

   f.White = FIDE_remap.get( f.White, f.White )
   f.Black = FIDE_remap.get( f.Black, f.Black )

#   
# Find duplicate games.

dup = Dup( CG, FIDE )

# Pair every CG game with the first FIDE game between the same two players.
# A pairing with reversed colours counts as found and is also recorded in
# mismatch.colors.

for g in CG:

   for f in FIDE:
      if f.White == g.White and f.Black == g.Black:
         break
      if f.White == g.Black and f.Black == g.White:
         mismatch.colors.append( (g,f) )
         break
   else:
      missing.cg.append( g )
      continue

   if g.Result != f.Result:
      mismatch.result.append( (g,f) )

   #
   # Need to allow result to be different, we catalogue that separately:
   # the last token of the movetext is left out of the comparison.
   if g.pgn.split()[:-1] != f.pgn.split()[:-1]:
      mismatch.pgn.append( (g,f) )


# A FIDE game is missing from CG unless CG has a game with the same players
# and the same colours.

for f in FIDE:
   if not any( f.White == g.White and f.Black == g.Black for g in CG ):
      missing.fide.append( f )

display()

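# ---------------------------------------------------------------
# Web page residue (blog comment form) - not part of the script.
# ---------------------------------------------------------------
#
# Advertisements
#
# Leave a Reply
#
# Fill in your details below or click an icon to log in:
#
# WordPress.com Logo
#
# You are commenting using your WordPress.com account. ( Log Out / Change )
#
# Twitter picture
#
# You are commenting using your Twitter account. ( Log Out / Change )
#
# Facebook photo
#
# You are commenting using your Facebook account. ( Log Out / Change )
#
# Google+ photo
#
# You are commenting using your Google+ account. ( Log Out / Change )
#
# Connecting to %s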