Ich benutze das Arabic WordNet (AWN), um Synonyme zu erhalten, und möchte es in meinen Code integrieren, um die Ähnlichkeitsmatrix zu berechnen. Die Idee ist: Wenn jemand ein Synonym eines Wortes verwendet, soll der Wert 1 herauskommen, was „ähnlich“ bedeutet. Hier ist der Code (Synonyme mit AWN abrufen):
# -*- coding: utf-8 -*-
from numpy import zeros
from scipy.linalg import svd
from math import log
from numpy import asarray, sum
#from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
#from nltk.stem import PorterStemmer
#from nltk.stem.isri import ISRIStemmer
#import nltk
#from matplotlib import pyplot as plt
from snowballstemmer import stemmer
from AWNDatabaseManagement import wn
# Sample corpus: one short Arabic sentence per document.
# Whitespace inside the literals is kept exactly as written.
titles = [
    u" ذهبت الاخت الى المدرسة",            # approx. "The sister went to the school"
    u"تقع المدرسة في الجبال",              # approx. "The school is located in the mountains"
    u"ذهب الام لزيارة ابنتها في المدرسة ",  # approx. "The mother went to visit her daughter at school"
    u"تحضر الام الكعكة",                   # approx. "The mother prepares the cake"
]
# Shared Arabic stemmer instance from the snowballstemmer package;
# LSA.parse calls its stemWord() on each token.
ar_stemmer = stemmer("arabic")
# Arabic function words that are excluded from the term dictionary
# (roughly: "then", "and", "until", "to", "on", "in").
stopwords = [u'ثم', u'و', u'حتى', u'الى', u'على', u'في']

# Punctuation characters handed to LSA as `ignorechars`:
# comma, colon, apostrophe and exclamation mark.
ignorechars = ",:'!"
class LSA(object):
    """Collect stemmed words per document and build a word-by-document count matrix.

    Typical use: call :meth:`parse` once per document, then :meth:`build`.
    (The class name suggests the matrix is meant for latent semantic
    analysis; the decomposition itself is not part of this class.)

    Attributes:
        stopwords: collection of words that are never indexed.
        ignorechars: string of characters removed from every token.
        stem_func: optional callable ``word -> stem``; when ``None`` the
            module-level ``ar_stemmer.stemWord`` is used.
        wdict: maps each stem to the list of document indices in which it
            occurred (one entry per occurrence, so duplicates are possible).
        dcount: number of documents parsed so far.
        keys: sorted stems selected by :meth:`build` (empty before that).
        A: count matrix produced by :meth:`build` (``None`` before that).
    """

    def __init__(self, stopwords, ignorechars, stem_func=None):
        """Store the configuration and start with an empty index.

        Args:
            stopwords: words to skip while parsing.
            ignorechars: characters stripped from tokens before stemming.
            stem_func: optional replacement for the default stemmer. Any
                callable taking a word and returning its normalised form
                works, e.g. one that maps synonyms to a common key.
        """
        self.stopwords = stopwords
        self.ignorechars = ignorechars
        self.stem_func = stem_func
        self.wdict = {}
        self.dcount = 0
        self.keys = []
        self.A = None

    def parse(self, doc):
        """Index one document and advance the document counter.

        Each whitespace-separated token has the ``ignorechars`` removed and
        is then stemmed. Tokens that are empty or stopwords (checked both
        before and after stemming) are skipped.

        Args:
            doc: the document text.
        """
        # Resolve the stemmer lazily so the module global is only required
        # when no custom stem_func was supplied.
        stem_word = self.stem_func
        if stem_word is None:
            stem_word = ar_stemmer.stemWord
        ignored = self.ignorechars or ""

        # split() without an argument handles tabs/newlines and never
        # yields empty tokens for repeated or leading/trailing blanks.
        for token in doc.split():
            word = "".join(ch for ch in token if ch not in ignored)
            # Check the surface form first: the stemmer may rewrite a
            # stopword so that it no longer matches the stopword list.
            if not word or word in self.stopwords:
                continue
            stem = stem_word(word)
            # NOTE(review): the original had a commented-out AWN synset
            # lookup for the stem at this point; a synonym-canonicalising
            # step would belong here (or inside a custom stem_func).
            if not stem or stem in self.stopwords:
                continue
            self.wdict.setdefault(stem, []).append(self.dcount)
        self.dcount += 1

    def build(self):
        """Build the count matrix ``A`` from the parsed documents.

        Only stems with more than one recorded occurrence are kept. Rows
        follow the sorted order of ``self.keys``; columns are document
        indices; each cell holds how often the stem occurred in that
        document.
        """
        self.keys = sorted(
            stem for stem, docs in self.wdict.items() if len(docs) > 1
        )
        self.A = zeros([len(self.keys), self.dcount])
        for row, stem in enumerate(self.keys):
            for doc_index in self.wdict[stem]:
                self.A[row, doc_index] += 1
Ich möchte nun an der Stelle von «راح» den Stamm (`stem`) aus der Methode `parse` der Klasse einsetzen:
from AWNDatabaseManagement import wn
# Look up the AWN synset ids for one (vocalised) Arabic word and print
# a description of each synset.
synsets = wn.get_synsetids_from_word(u"رَاحَ")
# Parenthesised form prints the same on Python 2 and also parses on Python 3.
print(synsets)
for s in synsets:
    # NOTE(review): `_items` is accessed as a private attribute of `wn`;
    # presumably it maps synset ids to item objects — confirm against
    # AWNDatabaseManagement.
    wn._items[s].describe()