#!/usr/bin/python
# -*- coding: utf-8 -*-
# --------------------------------------------------
# File Name: analyseXML.py
# Location:
# Purpose: Analyse TEI-XML encodings of a play's Folio (F) and Quarto (Q)
#          texts: per-speech verse/prose metrics, TKL word counts, plots
#          and tables.
# Creation Date: 06-07-2017
# Last Modified: Thu, Jan 25, 2018 5:38:23 PM
# Author(s): Mike Stout
# Copyright 2017 The Author(s) All Rights Reserved
# Credits:
# --------------------------------------------------
import codecs
import xml.dom.minidom as minidom
import sys
import re
# Command-line arguments: path prefix, play abbreviation, and the Folio
# and Quarto XML filenames.
prefix, play, f_filename, q_filename = sys.argv[1:]
doTKL = 1      # generate the TKL-word plots/tables
doMetrics = 1  # generate the per-metric plots/tables
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('chop_threshold', 0)
pd.set_option('precision', 4)
pd.set_option('display.colheader_justify','right')
pd.set_option('display.max_colwidth', -1)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
six_colors = ['red', 'coral', 'lightcoral', 'blue', 'slateblue', 'royalblue']
two_colors = ['red', 'blue']
def hack_utf8_to_ascii(s):
    # Lossy conversion: silently drop any non-ASCII characters.
    return s.encode('ascii', 'ignore')
def words(s):
    # Split on whitespace, punctuation and the markup characters used below.
    return filter(lambda x: x != '', re.split(r"[ ,;:.<>?!#@\n\t'~%]", s))
def words_(s):
    # Split on whitespace only.
    return filter(lambda x: x != '', re.split(r'[ \n\t]', s))
# Example (illustrative):
#   words('hello,how are you?I am fine,thank you. And you?')
#   -> ['hello', 'how', 'are', 'you', 'I', 'am', 'fine', 'thank', 'you', 'And', 'you']
# Groups of variant word forms tracked by the TKL analysis ('which_rel' is
# the relative-'which' token produced in get_all_text below) ...
tkl_word_groups = map(words,
    [ 'which_rel that who'
    , 'do does doth'
    , 'these this those'
    , 'thine thy'
    ])
for g in tkl_word_groups: print g
def getTklWordGroup(w):
    # Return the first word group containing w (IndexError if none).
    def ok(g):
        return w in g
    return filter(ok, tkl_word_groups)[0]
all_tkl_word_tokens = [ w for kws in tkl_word_groups for w in kws ]
function_words = \
[ 'for'
, 'in'
]
def isFunctionWord(w):
    # NB: despite the name, this currently tests membership of the TKL token
    # list; the original function_words check is kept commented out.
    #res = w in function_words
    res = w in all_tkl_word_tokens
    return res
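# Illustrative, given the groups above:
#   isFunctionWord('thy') -> True
#   isFunctionWord('for') -> False  (only TKL tokens count with the current check)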
# Verse fragments are tagged with '~' by get_all_text below; prose is not.
def isProse(s): return (s!=' ' and '~' not in s)
def isVerse(s): return (s!=' ' and '~' in s)
def getVerseLines(txt):
    # Split on the line delimiters inserted by get_all_text; keep verse only.
    txt_ = re.split('[@#%]',txt)
    return filter(isVerse, txt_)
def getProseLines(txt):
    # Split on the line delimiters inserted by get_all_text; keep prose only.
    txt_ = re.split('[@#%]',txt)
    return filter(isProse, txt_)
def getVerseWords(txt):
xs = ' '.join(getVerseLines(txt))
return words(xs)
def getProseWords(txt):
xs = ' '.join(getProseLines(txt))
return words(xs)
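# Illustrative, using the markers produced by get_all_text below:
#   txt = 'POLONIUS %My lord #as you say @~To be or not'
#   getVerseLines(txt) -> ['~To be or not']
#   getProseLines(txt) -> ['POLONIUS ', 'My lord ', 'as you say ']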
def divide(a,b):
    # Safe division: return 0 instead of raising on bad input or b == 0.
    try: res = float(a) / float(b)
    except (TypeError, ValueError, ZeroDivisionError): res = 0
    return res
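# e.g. divide(10, 4) -> 2.5 ; divide(3, 0) -> 0 (instead of raising).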
# https://stackoverflow.com/questions/15450192/fastest-way-to-compute-entropy-in-python
def entropy(labels):
    # Shannon entropy in bits: H = -sum_i p_i * log2(p_i), where p_i is the
    # relative frequency of each distinct label.
    prob_dict = {x:labels.count(x)/float(len(labels)) for x in labels}
    probs = np.array(list(prob_dict.values()))
    return - probs.dot(np.log2(probs))
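# Worked example: entropy(['a','a','b','b']) has p_a = p_b = 0.5, so
# H = -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0 bit; entropy(['a','a']) -> 0.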
def analyse(a):
txt = a[-1]
num_chars = len(txt)
ws = words(txt)
num_words = len(ws)
    # Only count verse lines (delimited by '@' and tagged '~') ...
verse_lines = getVerseLines(txt)
num_verse_lines = len(verse_lines) #or 1
verse_words = getVerseWords(txt)
num_verse_words = len(verse_words) #or 1
w_per_vl = divide(num_verse_words, num_verse_lines)
    # Only count prose lines (delimited by '%' or '#') ...
prose_lines = getProseLines(txt)
#print "PROSE", prose_lines
num_prose_lines = len(prose_lines) #or 1
prose_words = getProseWords(txt)
num_prose_words = len(prose_words) #or 1
w_per_pl = divide(num_prose_words, num_prose_lines)
func_words = filter(isFunctionWord, ws)
num_func_words = float(len(func_words))
#print num_prose_lines, num_prose_words, num_verse_lines, num_verse_words, txt
#v2p = float(num_verse_words) - float(num_prose_words)
v2p = float(w_per_vl) - float(w_per_pl)
ent = entropy(ws)
tkl_analysis = []
    for w in all_tkl_word_tokens:
        # Case-insensitive substring match (so 'do' also matches 'does' etc.).
        pred = lambda x: w.lower() in x.lower()
        f_words = filter(pred, ws)
num_f_words = float(len(f_words))
tkl_analysis.append(num_f_words)
return [num_chars, num_words, num_verse_words, num_prose_words, num_verse_lines, num_prose_lines, v2p, w_per_vl, w_per_pl, num_func_words, ent] + tkl_analysis
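# The stats vector above lines up with metric_names + all_tkl_word_tokens,
# the column names assigned to the DataFrame further down.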
#-------------------------------------------------------------------------------
# Make Canonical Name LUT ...
with open(prefix+"/"+play+'/'+play+'-names.txt', 'r') as fh:
name_lut = []
i=0
    for line in fh:
        # The header row supplies the two edition column labels.
        if i==0: _,col1,col2 = line[:-1].split('\t')
        if i>0 and len(line)>1:
            # Each data row: canonical name, Q aliases, F aliases (comma-separated).
            n,q,f = line.split('\t')
            qs = q.split(',')
            fs = f.split(',')
            for k in qs: name_lut.append((col1+'_'+k.strip(),n))
            for k in fs: name_lut.append((col2+'_'+k.strip(),n))
        i+=1
name_lut = dict(name_lut)
print name_lut
nameIds = dict([ (k,i) for i,k in enumerate(name_lut.keys() ) ])
print nameIds
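# Illustrative shape (assuming a names file with edition columns 'Q' and 'F'):
#   name_lut maps '<edition column>_<alias>' to a canonical name,
#   e.g. {'Q_Ham.': 'Hamlet', 'F_Ham.': 'Hamlet', ...}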
#exit()
#-------------------------------------------------------------------------------
# Make Canonical Scene Act LUT ...
with open(prefix+"/"+play+'/'+play+'-scenes.txt', 'r') as fh:
scene_lut = []
inv_scene_lut = []
act_lut = []
i=0
    for line in fh:
        # Data rows start with 'P'(rologue), '-' or a digit.
        if i>0 and line[0] in ['P','-']+map(str,xrange(10)):
            q,f = line[:-1].split('\t')
            q_str = q
            print ">", q,f
            # Sentinel codes: Prologue -> -1, Epilogue -> 99, absent ('-') -> 0.
            if q=="Prologue": q = -1
            elif q=="Epilogue": q = 99
            elif q=="-": q = 0
            elif "." in q_str: q = i   # dotted Q entries get a synthetic number
            else: q = int(q)
            if f=="-": a,f = -1,-1
            elif f=="Epilogue": a,f = 99,99
            elif f=="Prologue": a,f = -1,-1
            elif "." not in f:
                a = 1                  # no act division: treat as act 1
                f = int(f)
            else: a,f = map(int, f.split('.') )
            scene_lut.append(( (a,f),q))   # (F act, F scene) -> Q scene
            inv_scene_lut.append((q,f))    # Q scene -> F scene
            act_lut.append(( q, a) )       # Q scene -> F act
        i+=1
scene_lut = dict(scene_lut)
print scene_lut
inv_scene_lut = dict(inv_scene_lut)
print inv_scene_lut
act_lut = dict(act_lut)
print act_lut
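# Illustrative shapes (values depend on the play's scenes file):
#   scene_lut     {(F act, F scene): Q scene}   e.g. {(1, 1): 1, (1, 2): 2, ...}
#   inv_scene_lut {Q scene: F scene}
#   act_lut       {Q scene: F act}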
#exit()
#-------------------------------------------------------------------------------
#which_as_relative =
# '<seg subtype='relative' type='homograph'>which_rel</seg>'
def get_all_text( node ):
# do not include stage directions ...
if node.nodeName == 'stage': return ''
if node.nodeType == node.TEXT_NODE:
return node.data
else:
txt = ''
        for child_node in node.childNodes:
            # Mark the start of added ('add') text ...
            if child_node.nodeName == 'add': txt += '>'
            # Add a delimiter '@' (and verse tag '~') for verse lines ...
            if child_node.nodeName == 'l': txt += '@~'
            # Add a delimiter '%' for prose paragraphs ...
            if child_node.nodeName == 'p': txt += '%'
            # Add a delimiter '#' for prose line breaks ...
            if child_node.nodeName == 'lb': txt += '#'
            # Rewrite relative 'which' as the token 'which_rel' ...
            if child_node.nodeName == 'seg' \
                    and child_node.attributes['type'].value=='homograph' \
                    and child_node.attributes['subtype'].value=='relative' \
                    and child_node.firstChild.nodeValue.lower() == 'which':
                child_node.firstChild.replaceWholeText('which_rel')
            t = get_all_text( child_node )
            # Mark the end of added text ...
            if child_node.nodeName == 'add': t += '<'
            txt += ' '+t+' '
txt = hack_utf8_to_ascii(txt.replace('\n',' '))
txt = ' '.join(words_(txt))
return txt
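# Illustrative: a speech containing one verse line and one prose paragraph
# comes back roughly as '@~ To be or not to be % Ay my lord' (markers
# prepended, whitespace collapsed, non-ASCII dropped).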
def getSpeaker(xml):
    # Return the text of the first <speaker> element.
    for el in xml.getElementsByTagName('speaker'):
        return get_all_text(el)
def getSps(ed, xml):
sps = []
for n,el in enumerate(xml.getElementsByTagName('sp')):
who = el.getAttribute('who')
s = getSpeaker(el)
speaker_ = name_lut[ed+'_'+who]
        # Full speech text with the leading speaker name stripped off ...
        t = get_all_text(el)[len(s):]
        sps.append([n+1, who, speaker_, s, t])
return sps
def fixSplitSpeeches(ed, act, scene, sps):
    # Report consecutive speeches that share a 'who' attribute (speeches
    # split by stage directions); sps itself is returned unchanged.
    # NB: mode 'w' means each call overwrites the previous report file.
    pairs = zip(sps, sps[1:])
    with open(ed+'_splitSpeeches.txt', 'w') as text_file:
        for a,b in pairs:
            if a[1]==b[1]: # ... the 'who' attributes match?
s = '\n\n'+ ','.join(map(str, [ed,act,scene]))
text_file.write(s)
s = '\n'+ ','.join(map(str, a))
text_file.write(s)
s = '\n'+ ','.join(map(str, b))
text_file.write(s)
#exit()
return sps
def getScenes(ed, act, subset, xml):
scs = []
for el in xml.getElementsByTagName('div2'):
if el.getAttribute('type') == 'scene':
scene = el.getAttribute('n')
            print 'scene: ', scene
sps = getSps(ed, el)
# Find speeches split by stage directions ....
if subset not in ["Only", "Common"]:
sps = fixSplitSpeeches(ed, act, scene, sps)
for sp in sps:
scs.append([int(scene)] + sp)
return scs
def proc(f):
global play
    xml = minidom.parse(prefix+'/'+play+'/'+f)
    # Filename convention: <Play>_<Edition>_<Subset>.xml
    play, ed, Subset = f.split('.')[0].split('_')
    print ed, Subset
acts = []
for el in xml.getElementsByTagName('div1'):
if el.getAttribute('type') == 'act':
act = el.getAttribute('n')
print 'act: ', act
scs = getScenes(ed, act, Subset, el)
            for sc in scs:
                acts.append([ ed, Subset, int(act)] + sc)
        # Some texts place scenes directly at div1 (no act level) ...
        if el.getAttribute('type') == 'scene':
n = el.getAttribute('n')
sps = getSps(ed, el)
for sp in sps:
acts.append([ed, Subset, int(n), 0] + sp)
print acts
#exit()
return acts
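# proc() returns one row per speech:
#   [edition, subset, act (div1), scene (div2), speech number, 'who' id,
#    canonical speaker, speaker text, speech text]
# matching the 'Edtn' .. 'txt' column names assigned below.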
#-------------------------------------------------------------------------------
res = []
#for f in ['F_HC.xml', 'Q_HC.xml']:
#for f in 'F_plus.xml F_minus.xml Q_plus.xml Q_minus.xml'.split(' '):
for f in [f_filename, q_filename]:
print f
rs = proc(f)
print len(rs)
for r in rs:
stats = analyse(r)
res.append(stats + r)
#exit()
df = pd.DataFrame(res)
metric_names = ['#chars','#words'
, '#v_words', '#p_words'
, '#v_lines', '#p_lines'
, 'v2p', '#w_per_vl'
, '#w_per_pl', '#f_words'
, 'ent'
]
colNames = metric_names + \
all_tkl_word_tokens + \
['Edtn','Subset', 'div1', 'div2','sp#','who_','Speaker', 'spkr','txt']
df.columns = colNames
# fix numeric sorting of columns ...
for c in ['div1', 'div2','sp#']:
i = colNames.index(c)
df[df.columns[i]] = df[df.columns[i]].astype(int)
# Fix ordering of editions: prefix non-F editions with a space so they sort
# before 'F' in grouped tables and plots ...
def fixEd(r):
    if r.Edtn=='F': return 'F'
    else: return ' '+r.Edtn
df['Edition'] = df.apply(lambda row: fixEd(row), axis=1)
#-------------------------------------------------------------------------------
def mkPlot(filename, title, xlab, ylab, df, rot, k):
    # k = number of index levels: two colors for one level, six for two ...
    colors = two_colors
    if k==2: colors = six_colors
df.plot(kind='bar', width=.9, rot=rot, color=colors) # .legend()
plt.suptitle(title, fontsize=12)
plt.legend(prop={'size': 6})
plt.xlabel(xlab, fontsize=6)
plt.ylabel(ylab, fontsize=10)
    # The second element of each index tuple provides the bar labels ...
    bars = [ x[1] for x in df.index.tolist()]
plt.tick_params(axis=u'both', which=u'both',length=0)
plt.xticks(range(len(bars)), bars, size='small')
#plt.show()
plt.tight_layout()
filename = filename.replace(' ', '_')
plt.savefig(play+'_data/'+filename+'.png')
plt.close()
def outputTable(filename, df):
filename = filename.replace(' ', '_')
html = df.to_html()
with open(play+'_data/'+filename+'.txt', 'w') as text_file:
text_file.write(html)
# Calculate Proportions...
#-------------------------------------------------------------------------------
# Assign F Acts to Q ...
def assignAct(df, r):
    if play=="Rom": return 1                # Rom is treated as a single act
    else:
        if r.Edtn == 'F': return r.div1
        else:
            if r.div2: return r.div1        # Q rows that already carry an act
            else: return act_lut[ r.div1 ]  # map a Q scene number to its F act
df['Act'] = df.apply(lambda row: assignAct(df, row), axis=1)
#-------------------------------------------------------------------------------
# Fix F div# 17 -> Q div# 18
print scene_lut
def assignDiv(df, r):
    # Map each row to a canonical scene number via scene_lut ...
    if play=="Rom":
        return scene_lut[(1, r.div1)]
    if r.Edtn[:1] == 'Q':
        if r.div2:
            return scene_lut[(r.div1, r.div2)]
        else:
            return r.div1
    else: return scene_lut[ (r.div1, r.div2) ]
df['Div'] = df.apply(lambda row: assignDiv(df, row), axis=1)
def fix_zero(s):
    if s=="0": s="_"   # show a missing scene number as '_'
    return s
def assignScene(df, r):
    # Build a label '<Div> (<Act>.<Scene>)', e.g. ' 3 (1.3)' ...
    d = r.Div
    print d
    #print inv_scene_lut
    s = inv_scene_lut[ d ]
    if d<10: d = ' '+str(d)   # pad to width 2 so labels align and sort
    else: d = str(d)
    return d + ' (' + str(r.Act) + '.'+ fix_zero(str(s)) + ')'
df['Scene'] = df.apply(lambda row: assignScene(df, row), axis=1)
print df
df.to_csv(play+'_df.csv', header=True, index=False, encoding='utf-8')
#-------------------------------------------------------------------------------
# Fix Subset names ....
#df['Subset'] = df['Subset'].replace('Full-a',' Full-XML')
df['Subset'] = df['Subset'].replace('Full',' Full-XML')
#df['Subset'] = df['Subset'].replace('Common','b) Common')
#df['Subset'] = df['Subset'].replace('Only','c) Only')
#-------------------------------------------------------------------------------
'''
def calcWordsPerVerseLine(r):
return float(r['#words']) / float(r['#v_lines'])
def calcWordsPerProseLine(r):
return float(r['#words']) / float(r['#p_lines'])
df['#w_per_vl'] = df.apply(lambda row: calcWordsPerVerseLine(row), axis=1)
df['#w_per_pl'] = df.apply(lambda row: calcWordsPerProseLine(row), axis=1)
'''
#-------------------------------------------------------------------------------
# Pairs Plots ...
'''
def pmi(dff, x, y):
df = dff.copy()
df['f_x'] = df.groupby(x)[x].transform('count')
df['f_y'] = df.groupby(y)[y].transform('count')
df['f_xy'] = df.groupby([x, y])[x].transform('count')
df['pmi'] = np.log(len(df.index) * df['f_xy'] / (df['f_x'] * df['f_y']) )
return df
df_ = df.copy()
df_ = pmi(df_, 'Speaker', 'which_rel')
df_['dataset'] = df['Edition'] + ' ' + df['Subset']
what = ['pmi', 'v2p'] # , '#v_words', '#p_words']
cats = ['dataset', 'Scene']
# Set categorical data to numeric ...
for c in cats:
df_[c] = pd.Categorical(df_[c])
df_[c] = df_[c].cat.codes
df_ = df_[ ['Speaker'] + cats + what ]
print df_
import seaborn as sns
sns.set(style="ticks", color_codes=True)
dataset_colors = [ 'royalblue', 'lightcoral', 'blue', 'red', 'slateblue', 'coral']
#sns.pairplot(df_, hue="dataset", palette=dataset_colors) # "husl")
sns.pairplot(df_, hue="Speaker", palette="husl")
plt.show()
exit()
print df_
pairs(df_)
exit()
'''
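# PMI in the disabled block above: pmi(x,y) = log( N * f_xy / (f_x * f_y) ),
# i.e. log( p(x,y) / (p(x) p(y)) ); positive when x and y co-occur more
# often than independence would predict.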
#-------------------------------------------------------------------------------
# X by Y by Z ...
# words, lines, speeches, speakers
# scene, act, speaker
# count, mean, median
def expand(xs):
    # Join with ' and ': ['Edition','Subset'] -> 'Edition and Subset'.
    return ' and '.join(xs)
def processMetric(metric, x,y,z, aggFun):
#title = 'Words by Act'
title = metric + ' for each ' + expand(y) + ' by ' + expand(z)
print title
    df_ = df
    # When comparing editions alone, restrict to the Full-XML subset ...
    if z==['Edition']:
        df_ = df_[ df_['Subset'] == ' Full-XML']
pt = df_.pivot_table(
values=x
, columns=y
, index=z
, aggfunc=aggFun
, fill_value=0
)
pt = pt.T
filename = title
    bs = y[0]
    # Long category names need vertical tick labels ...
    if bs in ['Scene', 'Speaker'] : rot = 90
    else: rot = 0
mkPlot(filename, title, bs, metric, pt, rot, len(z) )
    # Recompute the table with margins (row/column totals) ...
pt = df_.pivot_table(
values=x
, columns=y
, index=z
, aggfunc=aggFun
, fill_value=0
, margins=True
)
pt = pt.T
print pt
print '='*30
outputTable(filename, pt)
def count_uniq(xs):
    # Number of distinct non-null values.
    return len(xs.dropna().unique())
# Reduced metric set (kept for quick experiments; not used below) ...
metrics0 = [ \
  ('Number of Prose Words','#p_words', np.sum)
  #, ('Mean entropy of speeches', 'ent', np.mean)
  #, ('Median entropy of speeches', 'ent', np.median)
  ]
metrics = \
[ ('Mean of verse words per line minus prose words per line', 'v2p', np.mean)
, ('Median of verse words per line minus prose words per line', 'v2p', np.median)
, ('Number of Words','#words', np.sum)
, ('Number of Prose Words','#p_words', np.sum)
, ('Number of Verse Words','#v_words', np.sum)
, ('Number of Prose Lines','#p_lines', np.sum)
, ('Number of Verse Lines','#v_lines', np.sum)
, ('Mean Words per Speech','#words', np.mean)
, ('Median Words per Speech','#words', np.median)
#, ('Number of Prose Lines','#p_lines', np.sum)
, ('Mean Prose Lines per Speech','#p_lines', np.mean)
, ('Median Prose Lines per Speech','#p_lines', np.median)
#, ('Number of Verse Lines','#v_lines', np.sum)
, ('Mean Verse Lines per Speech','#v_lines', np.mean)
, ('Median Verse Lines per Speech','#v_lines', np.median)
, ('Number of Speeches','sp#', len)
, ('Number of Speakers','who_', count_uniq)
#, ('Number of words per prose line', '#w_per_pl', np.sum)
, ('Mean number of words per prose line', '#w_per_pl', np.mean)
, ('Median number of words per prose line', '#w_per_pl', np.median)
#, ('Number of words per verse line', '#w_per_vl', np.sum)
, ('Mean number of words per verse line', '#w_per_vl', np.mean)
, ('Median number of words per verse line', '#w_per_vl', np.median)
, ('Mean entropy of speeches', 'ent', np.mean)
, ('Median entropy of speeches', 'ent', np.median)
]
if doMetrics:
for x in metrics:
for y in ['Act', 'Scene', 'Speaker']:
for z in [['Edition']]: # , ['Edition','Subset']]:
                t,x_, aggFun = x
                # Counting distinct speakers per Speaker is degenerate; skip it ...
                if not (y=='Speaker' and x_=='who_'):
                    processMetric(t , [x_] , [y] , z, aggFun)
#-------------------------------------------------------------------------------
# TKL words by Act/Scene ...
def processTKLWord(w, div, subtype):
title = w + ' for each ' + div + ' by ' + expand(subtype)
print title
    df_ = df
    # When comparing editions alone, restrict to the Full-XML subset ...
    if subtype==['Edition']:
        df_ = df_[ df_['Subset'] == ' Full-XML']
    # Total word counts: the denominator for the proportions below ...
    tot = df_.pivot_table(
values= ['#words']
, columns=[div]
, index=subtype
, aggfunc=np.sum
, fill_value=0
)
    # (Disabled) Make proportions relative to full-xml counts for each edition ...
    if 0:
if not subtype==['Edition']:
a = np.array([np.array(tot)[0]]).T
a = np.append(a,a, axis=1)
a = np.append(a,a, axis=1)
a = a.T
a = pd.DataFrame(a[:3])
b = np.array([np.array(tot)[3]]).T
b = np.append(b,b, axis=1)
b = np.append(b,b, axis=1)
b = b.T
b = pd.DataFrame(b[:3])
m = np.append(a,b, axis=0)
tot[tot.columns] = m
print tot
exit()
    # (Disabled) Make proportions relative to the mean of full-xml counts per edition ...
    if 0:
if not subtype==['Edition']:
df_F = df[ (df.Edtn == 'F') & (df.Subset== ' Full-XML') ]
num_f_words = np.sum(df_F['#words'])
df_Q = df[ (df.Edtn[:1] == ' ') & (df.Subset== ' Full-XML') ]
print df_Q
exit()
num_q_words = np.sum(df_Q['#words'])
mean_num_words = (num_f_words + num_q_words) / 2
tot = tot*0 + mean_num_words
#print tot
#exit()
pt = df_.pivot_table(
values= [w]
, columns=[div]
, index=subtype
, aggfunc=np.sum
, fill_value=0
)
print pt
print tot
    # Express counts as a percentage of the group's total word tokens ...
    prop_pt = (pt / tot.values) * 100
    prop_pt = prop_pt.T
    print prop_pt
filename = title
title = 'Proportion of \'' + w + '\' for each ' + div + ' by ' + expand(subtype)
mkPlot(filename, title, div, 'Proportion of Word Tokens', prop_pt, 0 , len(subtype))
pt.columns = pt.columns.set_levels(['count'], level=0)
outputTable(filename+'_count', pt.T)
tot.columns = tot.columns.set_levels(['token group total'], level=0)
outputTable(filename+'_tot', tot.T)
prop_pt = prop_pt.T
prop_pt.columns = prop_pt.columns.set_levels(['proportion'], level=0)
prop_pt = prop_pt.T
outputTable(filename+'_prop', prop_pt)
if doTKL:
for div in ['Act', 'Scene']:
#for subtype in [['Edition'], ['Edition', 'Subset']]:
for subtype in [['Edition', 'Subset']]:
#for w in ['which_rel']:
for w in all_tkl_word_tokens:
processTKLWord(w, div, subtype)
#-------------------------------------------------------------------------------
# Calc does to doth Ratio
title = 'Ratio of Does to Doth by Act'
print title
#df['doesDothRatio'] = df['does'].astype(float) / df['doth'].astype(float)
pt_does = df.pivot_table(
values=['does']
, columns=['Act']
, index=['Edition']
, aggfunc=np.sum
, fill_value=0
)
pt_doth = df.pivot_table(
values=['doth']
, columns=['Act']
, index=['Edition']
, aggfunc=np.sum
, fill_value=0
)
pt = pt_does / pt_doth.values
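# NB: numpy division semantics apply here: acts with zero 'doth' tokens
# yield inf (or NaN for 0/0) in the ratio table.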
pt = pt.T
print pt
filename = title
mkPlot(filename, title, 'Act', 'Ratio', pt, 0, 1)
outputTable(filename, pt)
#-------------------------------------------------------------------------------
# Heatmaps
import seaborn as sns; sns.set()
sns.set(font_scale = .8)
# Number of words ...
def hmap(filename, title, xlab, ylab, pt, pal):
ax = plt.axes()
sns.heatmap(pt, ax = ax, center=0.0, annot=True, annot_kws={'size':5}
, fmt='.1f', linewidths=0.4, cmap=pal)
    # Strip the leading value-name prefix (e.g. '#words-') from the y tick labels ...
    labels = [item.get_text()[7:] for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels)
ax.set_title(title, fontsize=12)
plt.xlabel(xlab, fontsize=10)
plt.ylabel(ylab, fontsize=10)
plt.tight_layout()
filename = filename.replace(' ', '_')
plt.savefig(play+'_data/'+filename+'.png')
plt.close()
return pt
def procHeatMap(ed, df):
title = 'Number of words'+' '+ed
print title
pt = df.pivot_table(
values=['#words']
, columns=['Speaker']
, index=['Scene']
, aggfunc=np.sum
, fill_value=0
)
#pt = df.pivot('Speaker', 'Scene', '#words')
pt = pt.T
#print pt
#exit()
filename = title
    # Plotting can fail on degenerate tables; still write the table below ...
    try: hmap(filename, title, 'Scene', 'Speaker', pt, 'YlOrRd')
    except: pass
outputTable(filename, pt)
pt = pd.DataFrame(pt)
#pt.columns = pt.columns.droplevel()
return pt
folio = df['Edtn'] == 'F'
df_F = df[folio]
pt_F = procHeatMap('F', df_F)
print pt_F
quarto = df['Edtn'] != 'F'
df_Q = df[quarto]
pt_Q = procHeatMap('Q', df_Q)
print pt_Q
pt = pt_Q - pt_F
#pt[17] = pt_Q[17]
print pt
#exit()
# Blank out cells where both editions are zero: set Q+F == 0 cells to NaN
# (left unpainted by the heatmap), everything else to 1, then multiply ...
a = pt_Q + pt_F
#a[17] = pt_Q[17]
a[ a==0 ] = np.nan
a[ a>0 ] = 1.
pt = pt * a
print pt
title = 'Number of words Q - F'
filename = title
# As above: skip the plot on failure but still write the table ...
try: hmap(filename, title, 'Scene', 'Speaker', pt, 'RdBu_r')
except: pass
outputTable(filename, pt)