Extract Binary Info
How do I extract the binary information of PDFs, audio files, video files, word processor files, spreadsheet files, and presentation program files to CSV or Apache Parquet to work with later?
You can also read and write to Amazon S3 by supplying your AWS credentials and using s3a.
Scala RDD
Will not be implemented.
Scala DF
import io.archivesunleashed._
import io.archivesunleashed.udfs._
import org.apache.spark.sql.functions.desc
sc.setLogLevel("INFO")
// AWS credentials for reading from or writing to S3 via s3a.
sc.hadoopConfiguration.set("fs.s3a.access.key", "YOUR ACCESS KEY")
sc.hadoopConfiguration.set("fs.s3a.secret.key", "YOUR SECRET KEY")
// Local web archive collection.
val warcs = RecordLoader.loadArchives("/local/path/to/warcs", sc)
// S3 hosted web archive collection.
val warcsS3 = RecordLoader.loadArchives("s3a://your-data-bucket/", sc)
// Choose your format: CSV or Parquet.
// For CSV:
// .write.csv("/path/to/derivatives/csv/audio")
// .write.csv("s3a://your-derivatives-bucket/parquet/pages")
// For Parquet:
// .write.parquet("/path/to/derivatives/parquet/pages/")
// .write.parquet("s3a://your-derivatives-bucket/parquet/pages")
// Audio Files.
warcs.audio()
.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
.orderBy(desc("md5"))
.write.csv("/path/to/derivatives/csv/audio")
// Images.
warcsS3.images()
.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"width", $"height", $"md5")
.orderBy(desc("md5"))
.write.parquet("/path/to/derivatives/parquet/image")
// PDFs.
warcs.pdfs()
.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
.orderBy(desc("md5"))
.write.csv("s3a://your-derivatives-bucket/csv/pdf")
// Presentation Program Files.
warcs.presentationProgramFiles()
.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
.orderBy(desc("md5"))
.write.parquet("s3a://your-derivatives-bucket/parquet/presentation-program")
// Spreadsheets.
warcs.spreadsheets()
.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
.orderBy(desc("md5"))
.write.csv("/path/to/derivatives/csv/spreadsheet")
// Videos.
warcs.videos()
.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
.orderBy(desc("md5"))
.write.csv("/path/to/derivatives/csv/video")
// Word Processor Files.
warcs.wordProcessorFiles()
.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
.orderBy(desc("md5"))
.write.parquet("/path/to/derivatives/parquet/word-processor")
sys.exit
Python DF
from aut import *
# Web archive collection (dataset).
warcs = WebArchive(sc, sqlContext, "/path/to/aut-resources-master/Sample-Data/*gz")
# Choose your format: CSV or Parquet.
# For CSV:
# .write.csv('/path/to/derivatives/csv/audio', header='true')
# Include header='true' if you want headers.
# For Parquet:
# .write.parquet("/path/to/derivatives/parquet/pages/")
# Audio Files.
warcs.audio().write.csv('/path/to/derivatives/csv/audio', header='true')
# Images.
warcs.images().write.parquet('/path/to/derivatives/parquet/images')
# Image Links.
warcs.image_links().write.csv('/path/to/derivatives/csv/images-links', header='true')
# PDFs.
warcs.pdfs().write.parquet('/path/to/derivatives/parquet/pdfs')
# Spreadsheets.
warcs.spreadsheets().write.csv('/path/to/derivatives/csv/spreadsheets', header='true')
# Presentation Program Files.
warcs.presentation_program().write.parquet('/path/to/derivatives/parquet/presentation_program')
# Videos.
warcs.video().write.parquet('/path/to/derivatives/parquet/video')
# Word Processor Files.
warcs.word_processor().write.csv('/path/to/derivatives/csv/word_processor', header='true')
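The Python DF examples above read and write on the local filesystem only. If your collection or derivatives live in Amazon S3, you can set the same s3a credentials from PySpark before loading or writing, and you can read a Parquet derivative back later to continue working with it. The following is a minimal sketch, not part of the aut API itself: it reaches the underlying Hadoop configuration through the common sc._jsc workaround, reuses the placeholder bucket names from the Scala example, and assumes WebArchive accepts any Hadoop-compatible path, as the Scala loader does.
from aut import *
# Sketch: set s3a credentials via the underlying Hadoop configuration.
sc._jsc.hadoopConfiguration().set('fs.s3a.access.key', 'YOUR ACCESS KEY')
sc._jsc.hadoopConfiguration().set('fs.s3a.secret.key', 'YOUR SECRET KEY')
# S3 hosted web archive collection (placeholder bucket name).
warcs_s3 = WebArchive(sc, sqlContext, 's3a://your-data-bucket/')
# Write a derivative to S3 instead of the local filesystem.
warcs_s3.pdfs().write.parquet('s3a://your-derivatives-bucket/parquet/pdfs')
# Read a Parquet derivative back later for further analysis.
pdfs = sqlContext.read.parquet('/path/to/derivatives/parquet/pdfs')
pdfs.show(10, False)
Parquet preserves column types and compresses well, so reading a derivative back is usually much faster than re-processing the original WARCs.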