#!/usr/bin/python
# -*- coding: utf-8 -*-
# --------------------------------------------------
# File Name: analyseXML.py
# Location:
# Purpose:
# Creation Date: 06-07-2017
# Last Modified: Thu, Jan 25, 2018 5:38:23 PM
# Author(s): Mike Stout
# Copyright 2017 The Author(s) All Rights Reserved
# Credits:
# --------------------------------------------------

import codecs
import xml.dom.minidom as minidom
import sys
import re

prefix, play, f_filename, q_filename = sys.argv[1:]

doTKL = 1
doMetrics = 1

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('chop_threshold', 0)
pd.set_option('precision', 4)
pd.set_option('display.colheader_justify', 'right')
pd.set_option('display.max_colwidth', -1)

import matplotlib.pyplot as plt
plt.style.use('ggplot')

six_colors = ['red', 'coral', 'lightcoral', 'blue', 'slateblue', 'royalblue']
two_colors = ['red', 'blue']

#f = sys.argv[1]

def hack_utf8_to_ascii(s):
    return s.encode('ascii', 'ignore')

def words(s):
    return filter(lambda x: x != '', re.split('[ ,;:.\>\<?!#@\n\t\'~%]', s))

def words_(s):
    return filter(lambda x: x != '', re.split('[ \n\t]', s))

'''
text='hello,how are you?I am fine,thank you. And you?'
print text
print words(text)
exit()
'''

tkl_word_groups = map(words, [ 'which_rel that who'
                             , 'do does doth'
                             , 'these this those'
                             , 'thine thy'
                             ])

for g in tkl_word_groups:
    print g

def getTklWordGroup(w):
    def ok(g):
        return w in g
    return filter(ok, tkl_word_groups)[0]

all_tkl_word_tokens = [ w for kws in tkl_word_groups for w in kws ]

function_words = \
    [ 'for'
    , 'in'
    ]

def isFunctionWord(w):
    #res = w in function_words
    res = any([ w in all_tkl_word_tokens ])
    return res

#N=int(sys.argv[1])

def isProse(s):
    return (s != ' ' and '~' not in s)

def isVerse(s):
    return (s != ' ' and '~' in s)

def getVerseLines(txt):
    txt_ = re.split('[@#%]', txt)
    xs = filter(lambda x: isVerse(x), txt_)
    '''
    if "Courtesan" in txt:
        print xs
        print len(xs)
        print map(len, map(words, xs))
        #exit()
    '''
    return xs

def getProseLines(txt):
    txt_ = re.split('[@#%]', txt)
    #print txt_
    xs = filter(lambda x: isProse(x), txt_)
    '''
    if "Courtesan" in txt:
        print xs
        print len(xs)
        print map(len, map(words, xs))
        exit()
    '''
    return xs

def getVerseWords(txt):
    xs = ' '.join(getVerseLines(txt))
    return words(xs)

def getProseWords(txt):
    xs = ' '.join(getProseLines(txt))
    return words(xs)

def divide(a, b):
    try:
        res = float(a) / float(b)
    except ZeroDivisionError:
        res = 0
    return res

# https://stackoverflow.com/questions/15450192/fastest-way-to-compute-entropy-in-python
def entropy(labels):
    prob_dict = {x: labels.count(x) / float(len(labels)) for x in labels}
    probs = np.array(list(prob_dict.values()))
    return -probs.dot(np.log2(probs))

def analyse(a):
    txt = a[-1]
    num_chars = len(txt)
    ws = words(txt)
    #k = len(ws)
    #if k==N: # ws[0]=='Fathom':
    #    print ws
    #    #exit()
    num_words = len(ws)

    # Only count verse lines (delimited by '@') ...
    verse_lines = getVerseLines(txt)
    num_verse_lines = len(verse_lines) #or 1
    verse_words = getVerseWords(txt)
    num_verse_words = len(verse_words) #or 1
    w_per_vl = divide(num_verse_words, num_verse_lines)

    # Only count prose lines (delimited by '#') ...
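    # For example, a hypothetical tagged text
    #   ' @~Verse one @~Verse two % A prose speech # its second line '
    # splits on '[@#%]' into two '~'-marked verse segments and two prose segments.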
    prose_lines = getProseLines(txt)
    #print "PROSE", prose_lines
    num_prose_lines = len(prose_lines) #or 1
    prose_words = getProseWords(txt)
    num_prose_words = len(prose_words) #or 1
    w_per_pl = divide(num_prose_words, num_prose_lines)

    func_words = filter(isFunctionWord, ws)
    num_func_words = float(len(func_words))

    #print num_prose_lines, num_prose_words, num_verse_lines, num_verse_words, txt
    #v2p = float(num_verse_words) - float(num_prose_words)
    v2p = float(w_per_vl) - float(w_per_pl)

    ent = entropy(ws)

    tkl_analysis = []
    for w in all_tkl_word_tokens:
        pred = lambda x: w.lower() in x.lower()
        f_words = filter(pred, ws)
        num_f_words = float(len(f_words))
        tkl_analysis.append(num_f_words)

    return [ num_chars, num_words, num_verse_words, num_prose_words
           , num_verse_lines, num_prose_lines, v2p, w_per_vl, w_per_pl
           , num_func_words, ent
           ] + tkl_analysis

#-------------------------------------------------------------------------------
# Make Canonical Name LUT ...

with open(prefix+"/"+play+'/'+play+'-names.txt', 'r') as fh:
    name_lut = []
    i = 0
    for line in fh:
        if i == 0:
            _, col1, col2 = line[:-1].split('\t')
        if i > 0 and len(line) > 1:
            n, q, f = line.split('\t')
            qs = q.split(',')
            fs = f.split(',')
            for k in qs:
                name_lut.append((col1+'_'+k.strip(), n))
            for k in fs:
                name_lut.append((col2+'_'+k.strip(), n))
        i += 1

name_lut = dict(name_lut)
print name_lut

nameIds = dict([ (k, i) for i, k in enumerate(name_lut.keys()) ])
print nameIds
#exit()

#-------------------------------------------------------------------------------
# Make Canonical Scene Act LUT ...

with open(prefix+"/"+play+'/'+play+'-scenes.txt', 'r') as fh:
    scene_lut = []
    inv_scene_lut = []
    act_lut = []
    i = 0
    for line in fh:
        if i > 0 and line[0] in ['P', '-'] + map(str, xrange(10)):
            q, f = line[:-1].split('\t')
            q_str = q
            print ">", q, f
            if q == "Prologue":
                q = -1
            elif q == "Epilogue":
                q = 99
            elif q == "-":
                q = 0
            elif "." in q_str:
                q = i
            else:
                q = int(q)
            if f == "-":
                a, f = -1, -1
            elif f == "Epilogue":
                a, f = 99, 99
            elif f == "Prologue":
                a, f = -1, -1
            elif "." not in f:
                a = 1
                f = int(f)
            else:
                a, f = map(int, f.split('.'))
            scene_lut.append(((a, f), q))
            inv_scene_lut.append((q, f))
            act_lut.append((q, a))
        i += 1

scene_lut = dict(scene_lut)
print scene_lut
inv_scene_lut = dict(inv_scene_lut)
print inv_scene_lut
act_lut = dict(act_lut)
print act_lut
#exit()

#-------------------------------------------------------------------------------
# which_as_relative =
#   '<seg subtype="relative" type="homograph">which_rel</seg>'

def get_all_text(node):
    # do not include stage directions ...
    if node.nodeName == 'stage':
        return ''
    if node.nodeType == node.TEXT_NODE:
        return node.data
    else:
        txt = ''
        for child_node in node.childNodes:
            # mark the start of additional text ...
            if child_node.nodeName == 'add':
                txt += '>'
            # Add a delimiter '@' for verse lines ...
            if child_node.nodeName == 'l':
                txt += '@~'
            # Add a delimiter '%' for prose paragraphs ...
            if child_node.nodeName == 'p':
                txt += '%'
            # Add a delimiter '#' for prose line breaks ...
            if child_node.nodeName == 'lb':
                txt += '#'
            # Parse 'which' as a relative ...
            if child_node.nodeName == 'seg' \
                    and child_node.attributes['type'].value == 'homograph' \
                    and child_node.attributes['subtype'].value == 'relative' \
                    and child_node.firstChild.nodeValue.lower() == 'which':
                child_node.firstChild.replaceWholeText('which_rel')
            t = get_all_text(child_node)
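            # e.g. a hypothetical source fragment
            #   <seg type="homograph" subtype="relative">which</seg>
            # has just been rewritten in place, so the recursive call above
            # reads it back as the disambiguated token 'which_rel'.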
            # mark the end of additional text ...
            if child_node.nodeName == 'add':
                t += '<'
            txt += ' ' + t + ' '
        txt = hack_utf8_to_ascii(txt.replace('\n', ' '))
        txt = ' '.join(words_(txt))
        return txt

def getSpeaker(xml):
    for el in xml.getElementsByTagName('speaker'):
        return get_all_text(el)

def getSps(ed, xml):
    sps = []
    for n, el in enumerate(xml.getElementsByTagName('sp')):
        who = el.getAttribute('who')
        s = getSpeaker(el)
        speaker_ = name_lut[ed+'_'+who]
        t = get_all_text(el)[len(s):]
        '''
        #except:
        if 0:
            print n, who, speaker_
            print el.toxml()
            exit()
        '''
        sps.append([n+1, who, speaker_, s, t])
    return sps

def fixSplitSpeeches(ed, act, scene, sps):
    pairs = zip(sps, sps[1:])
    # append, so split speeches found in earlier scenes are kept ...
    with open(ed+'_splitSpeeches.txt', 'a') as text_file:
        for a, b in pairs:
            if a[1] == b[1]:  # ... the "who" attributes match?
                s = '\n\n' + ','.join(map(str, [ed, act, scene]))
                text_file.write(s)
                s = '\n' + ','.join(map(str, a))
                text_file.write(s)
                s = '\n' + ','.join(map(str, b))
                text_file.write(s)
                #exit()
    return sps

def getScenes(ed, act, subset, xml):
    scs = []
    for el in xml.getElementsByTagName('div2'):
        if el.getAttribute('type') == 'scene':
            scene = el.getAttribute('n')
            print 'scene: ', scene
            sps = getSps(ed, el)
            # Find speeches split by stage directions ....
            if subset not in ["Only", "Common"]:
                sps = fixSplitSpeeches(ed, act, scene, sps)
            for sp in sps:
                scs.append([int(scene)] + sp)
    return scs

def proc(f):
    global play
    #play = 'Ham' # ../dissectXMLadditions/'
    xml = minidom.parse(prefix+'/'+play+'/'+f)
    play, ed, Subset = f.split('.')[0].split('_')
    print ed, Subset
    acts = []
    for el in xml.getElementsByTagName('div1'):
        if el.getAttribute('type') == 'act':
            act = el.getAttribute('n')
            print 'act: ', act
            scs = getScenes(ed, act, Subset, el)
            for sc in scs:
                acts.append([ed, Subset, int(act)] + sc)
        if el.getAttribute('type') == 'scene':
            n = el.getAttribute('n')
            sps = getSps(ed, el)
            for sp in sps:
                acts.append([ed, Subset, int(n), 0] + sp)
    print acts
    #exit()
    return acts

#-------------------------------------------------------------------------------

res = []
rs = []
#for f in ['F_HC.xml', 'Q_HC.xml']:
#for f in 'F_plus.xml F_minus.xml Q_plus.xml Q_minus.xml'.split(' '):
for f in [f_filename, q_filename]:
    print f
    rs = proc(f)
    print len(rs)
    for r in rs:
        stats = analyse(r)
        res.append(stats + r)
#exit()

df = pd.DataFrame(res)

metric_names = [ '#chars', '#words'
               , '#v_words', '#p_words'
               , '#v_lines', '#p_lines'
               , 'v2p', '#w_per_vl'
               , '#w_per_pl', '#f_words'
               , 'ent'
               ]

colNames = metric_names + \
           all_tkl_word_tokens + \
           ['Edtn', 'Subset', 'div1', 'div2', 'sp#', 'who_', 'Speaker', 'spkr', 'txt']

df.columns = colNames

# fix numeric sorting of columns ...
for c in ['div1', 'div2', 'sp#']:
    i = colNames.index(c)
    df[df.columns[i]] = df[df.columns[i]].astype(int)

# Fix Ordering of editions ...
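# fixEd() prefixes quarto labels with a space (' Q'), which sorts ahead of 'F'
# lexically, presumably so quarto rows come first in the pivot tables below.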
def fixEd(r):
    if r.Edtn == 'F':
        return 'F'
    else:
        return ' ' + r.Edtn

df['Edition'] = df.apply(lambda row: fixEd(row), axis=1)

#-------------------------------------------------------------------------------

def mkPlot(filename, title, xlab, ylab, df, rot, k):
    # k is the number of index levels: two levels -> six series, else two ...
    colors = two_colors
    if k == 2:
        colors = six_colors
    df.plot(kind='bar', width=.9, rot=rot, color=colors) # .legend()
    plt.suptitle(title, fontsize=12)
    plt.legend(prop={'size': 6})
    plt.xlabel(xlab, fontsize=6)
    plt.ylabel(ylab, fontsize=10)
    # label the bars with the second level of the row MultiIndex ...
    bars = [ x[1] for x in df.index.tolist() ]
    plt.tick_params(axis=u'both', which=u'both', length=0)
    plt.xticks(range(len(bars)), bars, size='small')
    #plt.show()
    plt.tight_layout()
    filename = filename.replace(' ', '_')
    plt.savefig(play+'_data/'+filename+'.png')
    plt.close()

def outputTable(filename, df):
    filename = filename.replace(' ', '_')
    html = df.to_html()
    with open(play+'_data/'+filename+'.txt', 'w') as text_file:
        text_file.write(html)

# Calculate Proportions...

#-------------------------------------------------------------------------------
# Assign F Acts to Q

def assignAct(df, r):
    if play == "Rom":
        return 1
    else:
        if r.Edtn == 'F':
            return r.div1
        else:
            if r.div2:
                return r.div1
            else:
                return act_lut[r.div1]

df['Act'] = df.apply(lambda row: assignAct(df, row), axis=1)

#-------------------------------------------------------------------------------
# Fix F div# 17 -> Q div# 18

print scene_lut

def assignDiv(df, r):
    if play == "Rom":
        return scene_lut[(1, r.div1)]
    if r.Edtn[:1] == 'Q':
        if r.div2:
            return scene_lut[(r.div1, r.div2)]
        else:
            return r.div1
    else:
        return scene_lut[(r.div1, r.div2)]

df['Div'] = df.apply(lambda row: assignDiv(df, row), axis=1)

def fix_zero(s):
    if s == "0":
        s = "_"
    return s

def assignScene(df, r):
    d = r.Div
    print d
    #print inv_scene_lut
    s = inv_scene_lut[d]
    if d < 10:
        d = ' ' + str(d)
    else:
        d = str(d)
    return d + ' (' + str(r.Act) + '.' + fix_zero(str(s)) + ')'

df['Scene'] = df.apply(lambda row: assignScene(df, row), axis=1)

print df
df.to_csv(play+'_df.csv', header=True, index=False, encoding='utf-8')

#-------------------------------------------------------------------------------
# Fix Subset names ....

#df['Subset'] = df['Subset'].replace('Full-a',' Full-XML')
df['Subset'] = df['Subset'].replace('Full', ' Full-XML')
#df['Subset'] = df['Subset'].replace('Common','b) Common')
#df['Subset'] = df['Subset'].replace('Only','c) Only')

#-------------------------------------------------------------------------------
'''
def calcWordsPerVerseLine(r):
    return float(r['#words']) / float(r['#v_lines'])

def calcWordsPerProseLine(r):
    return float(r['#words']) / float(r['#p_lines'])

df['#w_per_vl'] = df.apply(lambda row: calcWordsPerVerseLine(row), axis=1)
df['#w_per_pl'] = df.apply(lambda row: calcWordsPerProseLine(row), axis=1)
'''

#-------------------------------------------------------------------------------
# Pairs Plots ...
'''
def pmi(dff, x, y):
    df = dff.copy()
    df['f_x'] = df.groupby(x)[x].transform('count')
    df['f_y'] = df.groupby(y)[y].transform('count')
    df['f_xy'] = df.groupby([x, y])[x].transform('count')
    df['pmi'] = np.log(len(df.index) * df['f_xy'] / (df['f_x'] * df['f_y']))
    return df

df_ = df.copy()
df_ = pmi(df_, 'Speaker', 'which_rel')
df_['dataset'] = df['Edition'] + ' ' + df['Subset']

what = ['pmi', 'v2p'] # , '#v_words', '#p_words']
cats = ['dataset', 'Scene']

# Set categorical data to numeric ...
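# (cat.codes re-labels each category as an integer; e.g. a hypothetical pair
#  ['F Full-XML', ' Q Full-XML'] becomes [1, 0] after lexical ordering, which
#  lets the pairplot treat these categoricals as numeric axes.)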
for c in cats:
    df_[c] = pd.Categorical(df_[c])
    df_[c] = df_[c].cat.codes

df_ = df_[ ['Speaker'] + cats + what ]
print df_

import seaborn as sns
sns.set(style="ticks", color_codes=True)
dataset_colors = [ 'royalblue', 'lightcoral', 'blue', 'red', 'slateblue', 'coral']
#sns.pairplot(df_, hue="dataset", palette=dataset_colors) # "husl")
sns.pairplot(df_, hue="Speaker", palette="husl")
plt.show()
exit()

print df_
pairs(df_)
exit()
'''

#-------------------------------------------------------------------------------
# X by Y by Z ...
# words, lines, speeches, speakers
# scene, act, speaker
# count, mean, median

def expand(xs):
    s = xs[0]
    for x in xs[1:]:
        s = s + ' and ' + x
    return s

def processMetric(metric, x, y, z, aggFun):
    #title = 'Words by Act'
    title = metric + ' for each ' + expand(y) + ' by ' + expand(z)
    print title
    df_ = df
    if z == ['Edition']:
        df_ = df_[ df_['Subset'] == ' Full-XML' ]
    pt = df_.pivot_table( values=x
                        , columns=y
                        , index=z
                        , aggfunc=aggFun
                        , fill_value=0
                        )
    pt = pt.T
    filename = title
    bs = y[0]
    if bs in ['Scene', 'Speaker']:
        rot = 90
    else:
        rot = 0
    mkPlot(filename, title, bs, metric, pt, rot, len(z))

    # Calc table with margins ...
    pt = df_.pivot_table( values=x
                        , columns=y
                        , index=z
                        , aggfunc=aggFun
                        , fill_value=0
                        , margins=True
                        )
    pt = pt.T
    print pt
    print '='*30
    outputTable(filename, pt)

def count_uniq(xs):
    return len(xs.dropna().unique())

metrics0 = \
    [ ('Number of Prose Words', '#p_words', np.sum)
    #, ('Mean entropy of speeches', 'ent', np.mean)
    #, ('Median entropy of speeches', 'ent', np.median)
    ]

metrics = \
    [ ('Mean of verse words per line minus prose words per line', 'v2p', np.mean)
    , ('Median of verse words per line minus prose words per line', 'v2p', np.median)
    , ('Number of Words', '#words', np.sum)
    , ('Number of Prose Words', '#p_words', np.sum)
    , ('Number of Verse Words', '#v_words', np.sum)
    , ('Number of Prose Lines', '#p_lines', np.sum)
    , ('Number of Verse Lines', '#v_lines', np.sum)
    , ('Mean Words per Speech', '#words', np.mean)
    , ('Median Words per Speech', '#words', np.median)
    , ('Mean Prose Lines per Speech', '#p_lines', np.mean)
    , ('Median Prose Lines per Speech', '#p_lines', np.median)
    , ('Mean Verse Lines per Speech', '#v_lines', np.mean)
    , ('Median Verse Lines per Speech', '#v_lines', np.median)
    , ('Number of Speeches', 'sp#', len)
    , ('Number of Speakers', 'who_', count_uniq)
    , ('Mean number of words per prose line', '#w_per_pl', np.mean)
    , ('Median number of words per prose line', '#w_per_pl', np.median)
    , ('Mean number of words per verse line', '#w_per_vl', np.mean)
    , ('Median number of words per verse line', '#w_per_vl', np.median)
    , ('Mean entropy of speeches', 'ent', np.mean)
    , ('Median entropy of speeches', 'ent', np.median)
    ]

if doMetrics:
    for x in metrics:
        for y in ['Act', 'Scene', 'Speaker']:
            for z in [['Edition']]: # , ['Edition','Subset']]:
                t, x_, aggFun = x
                if not (y == 'Speaker' and x_ == 'who_'):
                    processMetric(t, [x_], [y], z, aggFun)

#-------------------------------------------------------------------------------
# TKL words by Act/Scene ...
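# processTKLWord() pivots raw counts of one tracked token against the matching
# '#words' totals and reports, per (Edition, Subset) row and Act/Scene column,
#   proportion = 100 * count(w) / total('#words')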
def processTKLWord(w, div, subtype):
    title = w + ' for each ' + div + ' by ' + expand(subtype)
    print title
    df_ = df
    if subtype == ['Edition']:
        df_ = df_[ df_['Subset'] == ' Full-XML' ]
    tot = df_.pivot_table( values=['#words']
                         , columns=[div]
                         , index=subtype
                         , aggfunc=np.sum
                         , fill_value=0
                         )

    # Make Proportions relative to full xml counts for each edition ...
    if 0:
        if not subtype == ['Edition']:
            a = np.array([np.array(tot)[0]]).T
            a = np.append(a, a, axis=1)
            a = np.append(a, a, axis=1)
            a = a.T
            a = pd.DataFrame(a[:3])
            b = np.array([np.array(tot)[3]]).T
            b = np.append(b, b, axis=1)
            b = np.append(b, b, axis=1)
            b = b.T
            b = pd.DataFrame(b[:3])
            m = np.append(a, b, axis=0)
            tot[tot.columns] = m
            print tot
            exit()

    # Make Proportions relative to mean of full xml counts for each edition ...
    if 0:
        if not subtype == ['Edition']:
            df_F = df[ (df.Edtn == 'F') & (df.Subset == ' Full-XML') ]
            num_f_words = np.sum(df_F['#words'])
            df_Q = df[ (df.Edtn[:1] == ' ') & (df.Subset == ' Full-XML') ]
            print df_Q
            exit()
            num_q_words = np.sum(df_Q['#words'])
            mean_num_words = (num_f_words + num_q_words) / 2
            tot = tot*0 + mean_num_words
            #print tot
            #exit()

    pt = df_.pivot_table( values=[w]
                        , columns=[div]
                        , index=subtype
                        , aggfunc=np.sum
                        , fill_value=0
                        )
    print pt
    print tot

    prop_pt = (pt / tot.values) * 100
    prop_pt = prop_pt.T
    print prop_pt

    filename = title
    title = 'Proportion of \'' + w + '\' for each ' + div + ' by ' + expand(subtype)
    mkPlot(filename, title, div, 'Proportion of Word Tokens', prop_pt, 0, len(subtype))

    pt.columns = pt.columns.set_levels(['count'], level=0)
    outputTable(filename+'_count', pt.T)

    tot.columns = tot.columns.set_levels(['token group total'], level=0)
    outputTable(filename+'_tot', tot.T)

    prop_pt = prop_pt.T
    prop_pt.columns = prop_pt.columns.set_levels(['proportion'], level=0)
    prop_pt = prop_pt.T
    outputTable(filename+'_prop', prop_pt)

if doTKL:
    for div in ['Act', 'Scene']:
        #for subtype in [['Edition'], ['Edition', 'Subset']]:
        for subtype in [['Edition', 'Subset']]:
            #for w in ['which_rel']:
            for w in all_tkl_word_tokens:
                processTKLWord(w, div, subtype)

#-------------------------------------------------------------------------------
# Calc does to doth Ratio

title = 'Ratio of Does to Doth by Act'
print title

#df['doesDothRatio'] = df['does'].astype(float) / df['doth'].astype(float)

pt_does = df.pivot_table( values=['does']
                        , columns=['Act']
                        , index=['Edition']
                        , aggfunc=np.sum
                        , fill_value=0
                        )

pt_doth = df.pivot_table( values=['doth']
                        , columns=['Act']
                        , index=['Edition']
                        , aggfunc=np.sum
                        , fill_value=0
                        )

pt = pt_does / pt_doth.values
pt = pt.T
print pt

filename = title
mkPlot(filename, title, 'Act', 'Ratio', pt, 0, 1)
outputTable(filename, pt)

#-------------------------------------------------------------------------------
# Heatmaps

import seaborn as sns; sns.set()
sns.set(font_scale=.8)

# Number of words ...
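# procHeatMap() pivots '#words' into a Speaker x Scene grid for one edition;
# the closing Q - F difference heatmap blanks (NaN) any cell where both
# editions count zero words, so only genuine differences are coloured.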
def hmap(filename, title, xlab, ylab, pt, pal):
    ax = plt.axes()
    sns.heatmap( pt, ax=ax, center=0.0, annot=True, annot_kws={'size': 5}
               , fmt='.1f', linewidths=0.4, cmap=pal)
    # Fix y labels: strip the leading '#words-' from the MultiIndex tick labels
    labels = [item.get_text()[7:] for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels)
    ax.set_title(title, fontsize=12)
    plt.xlabel(xlab, fontsize=10)
    plt.ylabel(ylab, fontsize=10)
    plt.tight_layout()
    filename = filename.replace(' ', '_')
    plt.savefig(play+'_data/'+filename+'.png')
    plt.close()
    return pt

def procHeatMap(ed, df):
    title = 'Number of words' + ' ' + ed
    print title
    pt = df.pivot_table( values=['#words']
                       , columns=['Speaker']
                       , index=['Scene']
                       , aggfunc=np.sum
                       , fill_value=0
                       )
    #pt = df.pivot('Speaker', 'Scene', '#words')
    pt = pt.T
    #print pt
    #exit()
    filename = title
    try:
        hmap(filename, title, 'Scene', 'Speaker', pt, 'YlOrRd')
    except:
        pass
    outputTable(filename, pt)
    pt = pd.DataFrame(pt)
    #pt.columns = pt.columns.droplevel()
    return pt

folio = df['Edtn'] == 'F'
df_F = df[folio]
pt_F = procHeatMap('F', df_F)
print pt_F

quarto = df['Edtn'] != 'F'
df_Q = df[quarto]
pt_Q = procHeatMap('Q', df_Q)
print pt_Q

pt = pt_Q - pt_F
#pt[17] = pt_Q[17]
print pt
#exit()

# Find elements that are both 0 ...
a = pt_Q + pt_F
#a[17] = pt_Q[17]
a[a == 0] = np.nan
a[a > 0] = 1.
pt = pt * a
print pt

title = 'Number of words Q - F'
filename = title
try:
    hmap(filename, title, 'Scene', 'Speaker', pt, 'RdBu_r')
except:
    pass
outputTable(filename, pt)
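#-------------------------------------------------------------------------------
# Usage (a hypothetical invocation; the corpus layout is an assumption, but
# proc() expects XML file names of the form <play>_<edition>_<subset>.xml):
#
#   python analyseXML.py corpora Ham Ham_F_Full.xml Ham_Q_Full.xml
#
# This reads corpora/Ham/Ham-names.txt, corpora/Ham/Ham-scenes.txt and the two
# XML editions, writes Ham_df.csv, and saves plots and tables under Ham_data/
# (the <play>_data/ directory must already exist; savefig will not create it).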