Ich möchte den Gutenberg-Katalog (hier verfügbar) mit Python parsen. Ich habe Erfahrung mit Web Scraping und dem Parsen von HTML, aber dieses Format erschließt sich mir nicht. Ich habe es mit RDFlib und lxml etree versucht und probiere es derzeit so (Thema: Parsen von Gutenberg-RDF in Python):
# Parse one Project Gutenberg RDF/XML catalog record and dump its triples.
# NOTE(review): the book id is taken from the 'epub/10/' directory in the
# original path -- confirm it matches the record you actually want to read.
book_id = 10
# Substitute the %s placeholder; the unformatted template is not a file name.
path = 'epub/10/pg%s.rdf' % book_id
g = rdflib.Graph()
g.parse(path)
# N-Triples lists one statement per line, which makes it easy to spot the
# predicates (title, creator, ...) available for later lookups.
s = g.serialize(format='nt')
# Print the serialized triples rather than the Graph object itself.
print(s)
Ich suche nach den verschiedenen Metadatenwerten (Titel, Autor, Gutenberg-URL usw.). Unten füge ich eine Beispieldatei bei.
<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF xml:base="http://www.gutenberg.org/"
         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
         xmlns:cc="http://web.resource.org/cc/"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:dcterms="http://purl.org/dc/terms/"
         xmlns:pgterms="http://www.gutenberg.org/2009/pgterms/"
         xmlns:dcam="http://purl.org/dc/dcam/">
  <!-- Work description: licence link plus a note on where the full catalog archives can be downloaded. -->
  <cc:Work rdf:about="">
    <cc:license rdf:resource="http://www.gnu.org/licenses/gpl.html"/>
    <rdfs:comment>Archives containing the RDF files for *all* our books can be downloaded at
http://www.gutenberg.org/wiki/Gutenberg:Feeds#The_Complete_Project_Gutenberg_Catalog</rdfs:comment>
  </cc:Work>
  <!-- Record for ebooks/100: title, bookshelf, dates, rights, creator, subjects, and one dcterms:hasFormat entry per file. -->
  <pgterms:ebook rdf:about="ebooks/100">
    <dcterms:title>The Complete Works of William Shakespeare</dcterms:title>
    <pgterms:bookshelf>
      <rdf:Description rdf:nodeID="Ncc8361d84fc142969cf27b77ac8d0c24">
        <rdf:value>Plays</rdf:value>
        <dcam:memberOf rdf:resource="2009/pgterms/Bookshelf"/>
      </rdf:Description>
    </pgterms:bookshelf>
    <dcterms:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">1994-01-01</dcterms:issued>
    <dcterms:publisher>Project Gutenberg</dcterms:publisher>
    <dcterms:rights>Copyrighted. Read the copyright notice inside this book for details.</dcterms:rights>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/files/100/100.txt">
        <dcterms:isFormatOf rdf:resource="ebooks/100"/>
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">5589917</dcterms:extent>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2014-08-29T12:08:52</dcterms:modified>
        <dcterms:format>
          <rdf:Description rdf:nodeID="N19fd61f986a94cc18f5dce9ed07e8517">
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/plain; charset=us-ascii</rdf:value>
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
          </rdf:Description>
        </dcterms:format>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:license rdf:resource="license"/>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/100.kindle.images">
        <dcterms:isFormatOf rdf:resource="ebooks/100"/>
        <dcterms:format>
          <rdf:Description rdf:nodeID="N0ee902d343e44cb5a8f639fa55fc6334">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/x-mobipocket-ebook</rdf:value>
          </rdf:Description>
        </dcterms:format>
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">9509392</dcterms:extent>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-04-01T01:18:40.171080</dcterms:modified>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:subject>
      <rdf:Description rdf:nodeID="N0e2195113aa34bf7abfe001edf6a03a2">
        <rdf:value>English drama -- Early modern and Elizabethan, 1500-1600</rdf:value>
        <dcam:memberOf rdf:resource="http://purl.org/dc/terms/LCSH"/>
      </rdf:Description>
    </dcterms:subject>
    <dcterms:creator>
      <pgterms:agent rdf:about="2009/agents/65">
        <pgterms:name>Shakespeare, William</pgterms:name>
        <pgterms:birthdate rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">1564</pgterms:birthdate>
        <pgterms:deathdate rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">1616</pgterms:deathdate>
        <pgterms:alias>Shakspeare, William</pgterms:alias>
        <pgterms:webpage rdf:resource="http://en.wikipedia.org/wiki/William_Shakespeare"/>
        <pgterms:alias>Shakspere, William</pgterms:alias>
      </pgterms:agent>
    </dcterms:creator>
    <dcterms:subject>
      <rdf:Description rdf:nodeID="Ncb26996951d44761901e30445fc8a9dc">
        <dcam:memberOf rdf:resource="http://purl.org/dc/terms/LCC"/>
        <rdf:value>PR</rdf:value>
      </rdf:Description>
    </dcterms:subject>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/files/100/100.zip">
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2035857</dcterms:extent>
        <dcterms:format>
          <rdf:Description rdf:nodeID="Nb4f5881241fd42e9a0f8a07cb1462008">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/zip</rdf:value>
          </rdf:Description>
        </dcterms:format>
        <dcterms:format>
          <rdf:Description rdf:nodeID="Nc3c66052298f482488fb8f13215f92ba">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/plain; charset=us-ascii</rdf:value>
          </rdf:Description>
        </dcterms:format>
        <dcterms:isFormatOf rdf:resource="ebooks/100"/>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2014-08-29T12:09:20</dcterms:modified>
      </pgterms:file>
    </dcterms:hasFormat>
    <pgterms:downloads rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">4605</pgterms:downloads>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/100.epub.noimages">
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2376083</dcterms:extent>
        <dcterms:isFormatOf rdf:resource="ebooks/100"/>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-04-01T01:18:13.998200</dcterms:modified>
        <dcterms:format>
          <rdf:Description rdf:nodeID="N9dc27629e3164dba98c659dcaf47c7fe">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/epub+zip</rdf:value>
          </rdf:Description>
        </dcterms:format>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/100.html.noimages">
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">6944416</dcterms:extent>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-04-01T01:18:00.715792</dcterms:modified>
        <dcterms:format>
          <rdf:Description rdf:nodeID="N7140e760a0f14ae4ba4b027bd7f7f4f6">
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/html</rdf:value>
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
          </rdf:Description>
        </dcterms:format>
        <dcterms:isFormatOf rdf:resource="ebooks/100"/>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/100.kindle.noimages">
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">9509383</dcterms:extent>
        <dcterms:format>
          <rdf:Description rdf:nodeID="N34666f5ebdd8461ca1c6b8cfba5103e5">
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/x-mobipocket-ebook</rdf:value>
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
          </rdf:Description>
        </dcterms:format>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-04-01T01:19:07.134922</dcterms:modified>
        <dcterms:isFormatOf rdf:resource="ebooks/100"/>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/100.epub.images">
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2376084</dcterms:extent>
        <dcterms:isFormatOf rdf:resource="ebooks/100"/>
        <dcterms:format>
          <rdf:Description rdf:nodeID="N1e32eb8531504d378e05acb6440d24b0">
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/epub+zip</rdf:value>
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
          </rdf:Description>
        </dcterms:format>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-04-01T01:18:09.062427</dcterms:modified>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/100.rdf">
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-04-28T05:00:49.076168</dcterms:modified>
        <dcterms:format>
          <rdf:Description rdf:nodeID="N1d915c961af44ab7ac9c71e7ec068bde">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/rdf+xml</rdf:value>
          </rdf:Description>
        </dcterms:format>
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">11275</dcterms:extent>
        <dcterms:isFormatOf rdf:resource="ebooks/100"/>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:language>
      <rdf:Description rdf:nodeID="N5ff08142477c4bfeb3bac48c18ba23a4">
        <rdf:value rdf:datatype="http://purl.org/dc/terms/RFC4646">en</rdf:value>
      </rdf:Description>
    </dcterms:language>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/100.txt.utf-8">
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-04-01T01:17:42.102580</dcterms:modified>
        <dcterms:format>
          <rdf:Description rdf:nodeID="N98845b3d16bd42d787e9d7cba42bf44b">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/plain</rdf:value>
          </rdf:Description>
        </dcterms:format>
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">5589889</dcterms:extent>
        <dcterms:isFormatOf rdf:resource="ebooks/100"/>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:type>
      <rdf:Description rdf:nodeID="N47bb369dd96248ffb1f412145cdb0713">
        <rdf:value>Text</rdf:value>
        <dcam:memberOf rdf:resource="http://purl.org/dc/terms/DCMIType"/>
      </rdf:Description>
    </dcterms:type>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/100.html.images">
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">6944416</dcterms:extent>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-04-01T01:17:55.634002</dcterms:modified>
        <dcterms:format>
          <rdf:Description rdf:nodeID="Nd1733441ad824cff97a5d9ad50f0307b">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/html</rdf:value>
          </rdf:Description>
        </dcterms:format>
        <dcterms:isFormatOf rdf:resource="ebooks/100"/>
      </pgterms:file>
    </dcterms:hasFormat>
  </pgterms:ebook>
  <!-- Description of the Wikipedia page that the creator's pgterms:webpage points to. -->
  <rdf:Description rdf:about="http://en.wikipedia.org/wiki/William_Shakespeare">
    <dcterms:description>Wikipedia</dcterms:description>
  </rdf:Description>
</rdf:RDF>
Bei welchem Element *genau* benötigen Sie Hilfe, um es aus diesem Beispiel-XML auszulesen? Mit einem Beispiel für die Auswahl von einem oder zwei Elementen finden Sie hoffentlich selbst einen Weg, den Rest auszuwählen. – har07