News-Clustering-Programm zeigt keine Links in Python. Ich stieß im Buch „Programming Collective Intelligence“ (Kollektive Intelligenz programmieren) auf den folgenden Code in einer Datei namens newsfeatures.py.
Hier ist der Code:
import feedparser
import re
# RSS feed URLs to download: Reuters, AP, New York Times, Google News,
# Salon, Fox News and CNN.
feedlist = [
    'http://today.reuters.com/rss/topNews',
    'http://today.reuters.com/rss/domesticNews',
    'http://today.reuters.com/rss/worldNews',
    'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml',
    'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml',
    'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml',
    'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml',
    'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml',
    'http://www.nytimes.com/services/xml/rss/nyt/International.xml',
    'http://news.google.com/?output=rss',
    'http://feeds.salon.com/salon/news',
    'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss',
    'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss',
    'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss',
    'http://rss.cnn.com/rss/edition.rss',
    'http://rss.cnn.com/rss/edition_world.rss',
    'http://rss.cnn.com/rss/edition_us.rss',
]
def stripHTML(h):
    """Return *h* with everything between '<' and '>' removed.

    The scan is purely character based, not a real HTML parser:

    * '<' starts a tag; every following character is dropped until '>'.
    * '>' ends a tag and is replaced by a single space, so that words on
      either side of a tag do not run together.  A '>' that appears outside
      of any tag is therefore also turned into a space.
    * An unterminated tag swallows the rest of the input.

    h -- the text to clean (any iterable of one-character strings).
    """
    kept = []
    in_tag = False
    for c in h:
        if c == '<':
            in_tag = True
        elif c == '>':
            in_tag = False
            kept.append(' ')
        elif not in_tag:
            kept.append(c)
    # Join once instead of growing a string one character at a time.
    return ''.join(kept)
# One or more non-word characters.  The pattern must not be able to match the
# empty string: since Python 3.7 re.split() also splits on empty matches, so
# a '\W*' pattern would cut the text between every single character.
_WORD_SPLITTER = re.compile(r'\W+')


def separatewords(text):
    """Split *text* into lower-cased words that are longer than 3 characters.

    Words are the runs of word characters (as defined by the regex class
    ``\\w``) between runs of non-word characters.

    text -- the string to tokenize.
    Returns a list of words in their order of appearance; duplicates are kept.
    """
    return [s.lower() for s in _WORD_SPLITTER.split(text) if len(s) > 3]
def getarticlewords():
    """Download every feed in ``feedlist`` and count the words per article.

    Each entry's title and its HTML-stripped description are tokenized with
    ``separatewords``.  Entries whose title was already seen (in this or an
    earlier feed) are skipped, as are entries that have no title at all.

    Returns a tuple ``(allwords, articlewords, articletitles)``:

    allwords      -- dict mapping word -> total count over all kept articles.
    articlewords  -- list with one word -> count dict per kept article.
    articletitles -- list of the kept titles, parallel to ``articlewords``.
    """
    allwords = {}
    articlewords = []
    articletitles = []
    # Set kept next to the ordered title list so the duplicate check is O(1).
    seen_titles = set()

    # Loop over every feed
    for feed in feedlist:
        parsed = feedparser.parse(feed)

        # Loop over every article
        for e in parsed.entries:
            title = getattr(e, 'title', None)
            # Ignore untitled and identical articles
            if title is None or title in seen_titles:
                continue

            # Extract the words.  The text is kept as a text string (no
            # utf-8 encode round-trip, which breaks on Python 3), and a space
            # separates title and description so the last word of the title
            # cannot fuse with the first word of the description.
            description = getattr(e, 'description', '')
            text = title + ' ' + stripHTML(description)

            # Increase the counts for each word, globally and per article
            counts = {}
            for word in separatewords(text):
                counts[word] = counts.get(word, 0) + 1
                allwords[word] = allwords.get(word, 0) + 1

            articlewords.append(counts)
            articletitles.append(title)
            seen_titles.add(title)

    return allwords, articlewords, articletitles
def makematrix(allw, articlew):
    """Build the article-by-word count matrix.

    allw     -- dict mapping word -> total count over all articles.
    articlew -- list of dicts, one per article, mapping word -> count.

    Only words that are common but not too common are used as columns:
    a word is kept when its total count is greater than 3 and less than
    60% of the number of articles.

    Returns ``(matrix, wordvec)`` where ``wordvec`` is the list of kept words
    (in the iteration order of ``allw``) and ``matrix`` is a list of rows, one
    per article, holding that article's count for each word in ``wordvec``
    (0 when the article does not contain the word).
    """
    upper = len(articlew) * 0.6
    wordvec = [w for w, c in allw.items() if 3 < c < upper]

    # dict.get replaces the fragile "word in f and f[word] or 0" idiom.
    matrix = [[f.get(word, 0) for word in wordvec] for f in articlew]
    return matrix, wordvec
from numpy import *
def showfeatures(w, h, titles, wordvec, out='features.txt'):
    """Write a report of every feature to *out* and return its rankings.

    w       -- 2-D array indexed as ``w[article, feature]`` (weights).
    h       -- 2-D array indexed as ``h[feature, word]`` (weights).
    titles  -- article titles, one per row of ``w``.
    wordvec -- words, one per column of ``h``.
    out     -- path of the text file to (over)write.

    For each feature the file receives the list of its six highest-weighted
    words, then the three highest-weighted ``(weight, title)`` pairs, then a
    blank line.

    Returns ``(toppatterns, patternnames)``:

    toppatterns  -- one list per article holding a ``(weight, feature index,
                    title)`` tuple for every feature.
    patternnames -- one list per feature holding its top (up to six) words.
    """
    pc, wc = shape(h)
    toppatterns = [[] for _ in titles]
    patternnames = []

    # open() instead of the Python-2-only file() builtin; the with block
    # closes the file even if writing the report raises part-way through.
    with open(out, 'w') as outfile:
        # Loop over all the features
        for i in range(pc):
            # Words of this feature, highest weight first
            ranked_words = sorted(
                ((h[i, j], wordvec[j]) for j in range(wc)), reverse=True)

            # Print the first six elements
            n = [word for _, word in ranked_words[:6]]
            outfile.write(str(n) + '\n')
            patternnames.append(n)

            # Create a list of articles for this feature
            flist = []
            for j, title in enumerate(titles):
                weight = w[j, i]
                flist.append((weight, title))
                toppatterns[j].append((weight, i, title))

            # Show the top 3 articles
            flist.sort(reverse=True)
            for f in flist[:3]:
                outfile.write(str(f) + '\n')
            outfile.write('\n')

    # Return the pattern names for later use
    return toppatterns, patternnames
Die Nutzung ist wie folgt:
>>> import newsfeatures
>>> allw,artw,artt= newsfeatures.getarticlewords()
>>> artt[1]
u'Fatah, Hamas men abducted freed: sources'
Wie Sie sehen können, gibt diese Zeile die Schlagzeile aus.
>>> artt[1]
u'Fatah, Hamas men abducted freed: sources'
Was ich wissen möchte: Lässt sich das Programm irgendwie so ändern, dass es nicht nur die Überschrift anzeigt, sondern auch die Quelle der Überschrift aus der feedlist?
Könnte jemand helfen?
Danke!
Ändern Sie die Funktion `getarticlewords` und lesen Sie die [feedparser-Dokumentation](http://pythonhosted.org/feedparser/). Es gibt eine Reihe von Eigenschaften wie `feed.publisher`, `feed.title` und `entries[i].link`, die wahrscheinlich enthalten, was Sie wollen. – ChrisP
OMG, dieser schlechte Code ist *wirklich* in einem Buch gewesen?!? –