458 行
14 KiB
PHP
458 行
14 KiB
PHP
<?php
|
|
|
|
namespace CommonsMetadata;
|
|
|
|
use File;
|
|
use ForeignAPIFile;
|
|
use InvalidArgumentException;
|
|
use Language;
|
|
use LocalFile;
|
|
use MediaWiki\MediaWikiServices;
|
|
use ParserOutput;
|
|
use WikiFilePage;
|
|
|
|
/**
|
|
* Class to handle metadata collection and formatting, and manage more specific data extraction
|
|
* classes.
|
|
*/
|
|
class DataCollector {
|
|
|
|
/**
|
|
* Mapping of category names to assesment levels. Array keys are regexps which will be
|
|
* matched case-insensitively against category names; the first match is returned.
|
|
* @var array
|
|
*/
|
|
protected static $assessmentCategories = [
|
|
'poty' => '/^pictures of the year \(.*\)/',
|
|
'potd' => '/^pictures of the day \(.*\)/',
|
|
'featured' => '/^featured (pictures|sounds) on wikimedia commons/',
|
|
'quality' => '/^quality images/',
|
|
'valued' => '/^valued images/',
|
|
];
|
|
|
|
/**
|
|
* Language in which data should be collected. Can be null, which means collect all languages.
|
|
* @var Language
|
|
*/
|
|
protected $language;
|
|
|
|
/**
|
|
* If true, ignore $language and collect metadata in all languages.
|
|
* @var bool
|
|
*/
|
|
protected $multiLang;
|
|
|
|
/** @var TemplateParser */
|
|
protected $templateParser;
|
|
|
|
/** @var LicenseParser */
|
|
protected $licenseParser;
|
|
|
|
/**
|
|
* @param Language $language
|
|
*/
|
|
public function setLanguage( $language ) {
|
|
$this->language = $language;
|
|
}
|
|
|
|
/**
|
|
* @param bool $multiLang
|
|
*/
|
|
public function setMultiLang( $multiLang ) {
|
|
$this->multiLang = $multiLang;
|
|
}
|
|
|
|
/**
|
|
* @param TemplateParser $templateParser
|
|
*/
|
|
public function setTemplateParser( TemplateParser $templateParser ) {
|
|
$this->templateParser = $templateParser;
|
|
}
|
|
|
|
/**
|
|
* @param LicenseParser $licenseParser
|
|
*/
|
|
public function setLicenseParser( LicenseParser $licenseParser ) {
|
|
$this->licenseParser = $licenseParser;
|
|
}
|
|
|
|
/**
|
|
* Collects metadata from a file, and adds it to a metadata array.
|
|
* The array has the following format:
|
|
*
|
|
* '<metadata field name>' => array(
|
|
* 'value' => '<value>',
|
|
* 'source' => '<where did the data come from>',
|
|
* )
|
|
*
|
|
* For fields with multiple values and/or in multiple languages the format is more complex;
|
|
* see the documentation for the extmetadata API.
|
|
*
|
|
* @param array &$previousMetadata metadata collected so far;
|
|
* new metadata will be added to this array
|
|
* @param File $file
|
|
*/
|
|
public function collect( array &$previousMetadata, File $file ) {
|
|
$this->normalizeMetadataTimestamps( $previousMetadata );
|
|
|
|
$descriptionText = $this->getDescriptionText( $file, $this->language );
|
|
|
|
$categories = $this->getCategories( $file, $previousMetadata );
|
|
$previousMetadata = array_merge( $previousMetadata,
|
|
$this->getCategoryMetadata( $categories ) );
|
|
|
|
$templateData = $this->templateParser->parsePage( $descriptionText );
|
|
$previousMetadata = array_merge( $previousMetadata,
|
|
$this->getTemplateMetadata( $templateData ) );
|
|
}
|
|
|
|
/**
|
|
* Checks for the presence of metadata needed for attributing the file (author, source, license)
|
|
* and returns a list of keys corresponding to problems.
|
|
* @param ParserOutput $parserOutput
|
|
* @param File $file
|
|
* @return array one or more of the following keys:
|
|
* - no-license - failed to detect a license
|
|
* - no-description - failed to detect any image description
|
|
* - no-author - failed to detect author name or a custom attribution text
|
|
* - no-source - failed to detect the source of the image or a custom attribution text
|
|
*/
|
|
public function verifyAttributionMetadata( ParserOutput $parserOutput, File $file ) {
|
|
// HTML code of the file description
|
|
if ( !$parserOutput->hasText() ) {
|
|
$descriptionText = '';
|
|
} else {
|
|
$descriptionText = $parserOutput->getText();
|
|
}
|
|
|
|
$templateData = $this->templateParser->parsePage( $descriptionText );
|
|
$problems = $licenseData = $informationData = [];
|
|
|
|
if ( isset( $templateData[TemplateParser::LICENSES_KEY] ) ) {
|
|
$licenseData = $this->selectLicense( $templateData[TemplateParser::LICENSES_KEY] );
|
|
}
|
|
if ( isset( $templateData[TemplateParser::INFORMATION_FIELDS_KEY] ) ) {
|
|
$informationData = $this->selectInformationTemplate(
|
|
$templateData[TemplateParser::INFORMATION_FIELDS_KEY] );
|
|
}
|
|
|
|
if ( !isset( $licenseData['LicenseShortName'] )
|
|
|| $licenseData['LicenseShortName'] === ''
|
|
) {
|
|
$problems[] = 'no-license';
|
|
}
|
|
if ( !isset( $informationData['ImageDescription'] )
|
|
|| $informationData['ImageDescription'] === ''
|
|
) {
|
|
$problems[] = 'no-description';
|
|
}
|
|
if (
|
|
( !isset( $informationData['Artist'] ) || $informationData['Artist'] === '' ) &&
|
|
( !isset( $informationData['Attribution'] ) || $informationData['Attribution'] === '' )
|
|
) {
|
|
$problems[] = 'no-author';
|
|
}
|
|
if (
|
|
( !isset( $informationData['Credit'] ) || $informationData['Credit'] === '' ) &&
|
|
( !isset( $informationData['Attribution'] ) || $informationData['Attribution'] === '' )
|
|
) {
|
|
$problems[] = 'no-source';
|
|
}
|
|
|
|
// Certain uploads (3D objects) need a patent license
|
|
$templates = $parserOutput->getTemplates();
|
|
$templates = $templates[NS_TEMPLATE] ?? [];
|
|
if (
|
|
!array_key_exists( '3dpatent', $templates ) &&
|
|
$file->getMimeType() === 'application/sla'
|
|
) {
|
|
$problems[] = 'no-patent';
|
|
}
|
|
|
|
return $problems;
|
|
}
|
|
|
|
/**
|
|
* @param array $categories
|
|
* @return array
|
|
*/
|
|
protected function getCategoryMetadata( array $categories ) {
|
|
$assessments = $this->getAssessmentsAndRemoveFromCategories( $categories );
|
|
$licenses = $this->getLicensesAndRemoveFromCategories( $categories );
|
|
|
|
return [
|
|
'Categories' => [
|
|
'value' => implode( '|', $categories ),
|
|
'source' => 'commons-categories',
|
|
],
|
|
'Assessments' => [
|
|
'value' => implode( '|', $assessments ),
|
|
'source' => 'commons-categories',
|
|
],
|
|
];
|
|
}
|
|
|
|
/**
|
|
* @param array $templateData
|
|
* @return array
|
|
*/
|
|
protected function getTemplateMetadata( $templateData ) {
|
|
// GetExtendedMetadata does not handle multivalued fields,
|
|
// we need to select one of everything
|
|
$templateFields = [];
|
|
|
|
if ( isset( $templateData[TemplateParser::INFORMATION_FIELDS_KEY] ) ) {
|
|
$templateFields = array_merge( $templateFields, $this->selectInformationTemplate(
|
|
$templateData[TemplateParser::INFORMATION_FIELDS_KEY] ) );
|
|
}
|
|
|
|
if ( isset( $templateData[TemplateParser::LICENSES_KEY] ) ) {
|
|
$templateFields = array_merge( $templateFields,
|
|
$this->selectLicense( $templateData[TemplateParser::LICENSES_KEY] ) );
|
|
}
|
|
|
|
if ( isset( $templateData[TemplateParser::DELETION_KEY] ) ) {
|
|
$templateFields = array_merge( $templateFields,
|
|
$this->selectFirst( $templateData[TemplateParser::DELETION_KEY] ) );
|
|
}
|
|
|
|
if ( isset( $templateData[TemplateParser::RESTRICTIONS_KEY] ) ) {
|
|
$templateFields = array_merge( $templateFields,
|
|
$this->selectFirst( $templateData[TemplateParser::RESTRICTIONS_KEY] ) );
|
|
}
|
|
|
|
$metadata = [];
|
|
foreach ( $templateFields as $name => $value ) {
|
|
$metadata[ $name ] = [
|
|
'value' => $value,
|
|
'source' => 'commons-desc-page'
|
|
];
|
|
}
|
|
|
|
// use short name to generate internal name used in i18n
|
|
if ( isset( $templateFields['LicenseShortName'] ) ) {
|
|
$licenseData = $this->licenseParser->parseLicenseString(
|
|
$templateFields['LicenseShortName'] );
|
|
if ( isset( $licenseData['name'] ) ) {
|
|
$metadata['License'] = [
|
|
'value' => $licenseData['name'],
|
|
'source' => 'commons-templates',
|
|
];
|
|
}
|
|
}
|
|
|
|
return $metadata;
|
|
}
|
|
|
|
/**
|
|
* Gets the text of the file's description page.
|
|
* @param File $file
|
|
* @param Language $language
|
|
* @return string
|
|
*/
|
|
protected function getDescriptionText( File $file, Language $language ) {
|
|
# Note: If this is a local file, there is no caching here.
|
|
# However, the results of this module have longer caching for local
|
|
# files to help compensate. For foreign files, this method is cached
|
|
# via parser cache, and possibly a second cache depending on
|
|
# descriptionCacheExpiry (disabled on Wikimedia).
|
|
$text = $file->getDescriptionText( $language );
|
|
|
|
if ( get_class( $file ) == 'LocalFile' || get_class( $file ) == 'LocalFileMock' ) {
|
|
// LocalFile gets the text in a different way, and ends up with different output
|
|
// (specifically, relative instead of absolute URLs), so transform local URLs
|
|
// to absolute URLs after parse.
|
|
$text = ( new ParserOutput( $text ) )->getText( [ 'absoluteURLs' => true ] );
|
|
}
|
|
|
|
return $text;
|
|
}
|
|
|
|
/**
|
|
* @param File $file
|
|
* @param array $data metadata passed to the onGetExtendedMetadata hook
|
|
* @return string[] list of category names in human-readable format
|
|
*/
|
|
protected function getCategories( File $file, array $data ) {
|
|
$categories = [];
|
|
|
|
if ( is_a( $file, 'LocalFileMock' ) || is_a( $file, 'ForeignDBFileMock' ) ) {
|
|
// with all the hard-coded dependencies, mocking categoriy retrieval properly is
|
|
// pretty much impossible
|
|
return $file->mockedCategories;
|
|
} elseif ( $file instanceof LocalFile ) {
|
|
// for local or shared DB files (which are also LocalFile subclasses)
|
|
// categories can be queried directly from the database
|
|
|
|
$page = MediaWikiServices::getInstance()->getWikiPageFactory()->newFromTitle( $file->getOriginalTitle() );
|
|
if ( !$page instanceof WikiFilePage ) {
|
|
throw new InvalidArgumentException(
|
|
'Cannot instance WikiFilePage to get categories for ' . $file->getName()
|
|
. ', got instance of ' . get_class( $page )
|
|
);
|
|
}
|
|
$page->setFile( $file );
|
|
|
|
$categoryTitles = $page->getForeignCategories();
|
|
|
|
foreach ( $categoryTitles as $title ) {
|
|
$categories[] = $title->getText();
|
|
}
|
|
} elseif (
|
|
$file instanceof ForeignAPIFile
|
|
&& isset( $data['Categories'] )
|
|
) {
|
|
// getting categories for a ForeignAPIFile is not supported, but in case
|
|
// CommonsMetadata is installed on the remote repository as well, its output
|
|
// (including categories) is sent together with the extended file metadata,
|
|
// when the file is loaded. onGetExtendedMetadata hooks get that metadata
|
|
// when they are invoked.
|
|
$categories = explode( '|', $data['Categories']['value'] );
|
|
} else {
|
|
// out of luck - file is probably from a ForeignAPIRepo
|
|
// with CommonsMetadata not installed there
|
|
wfDebug( 'CommonsMetadata: cannot read category data' );
|
|
}
|
|
|
|
return $categories;
|
|
}
|
|
|
|
/**
|
|
* Matches category names to a category => license mapping, removes the matching categories
|
|
* and returns the corresponding licenses.
|
|
* @param array &$categories a list of human-readable category names.
|
|
* @return array
|
|
*/
|
|
protected function getLicensesAndRemoveFromCategories( &$categories ) {
|
|
$licenses = [];
|
|
foreach ( $categories as $i => $category ) {
|
|
$licenseData = $this->licenseParser->parseLicenseString( $category );
|
|
if ( $licenseData ) {
|
|
$licenses[] = $licenseData['name'];
|
|
unset( $categories[$i] );
|
|
}
|
|
}
|
|
$categories = array_merge( $categories ); // renumber to avoid holes in array
|
|
return $licenses;
|
|
}
|
|
|
|
/**
|
|
* Matches category names to a category => assessment mapping, removes the matching categories
|
|
* and returns the corresponding assessments (valued image, picture of the day etc).
|
|
* @param array &$categories a list of human-readable category names.
|
|
* @return array
|
|
*/
|
|
protected function getAssessmentsAndRemoveFromCategories( &$categories ) {
|
|
$assessments = [];
|
|
foreach ( $categories as $i => $category ) {
|
|
foreach ( self::$assessmentCategories as $assessmentType => $regexp ) {
|
|
if ( preg_match( $regexp . 'i', $category ) ) {
|
|
$assessments[] = $assessmentType;
|
|
unset( $categories[$i] );
|
|
}
|
|
}
|
|
}
|
|
$categories = array_merge( $categories ); // renumber to avoid holes in array
|
|
return array_unique( $assessments ); // potd/poty can happen multiple times
|
|
}
|
|
|
|
/**
|
|
* Receives a list of metadata arrays and selects the first one to use.
|
|
* @param array $arrays an array of arrays of metdata fields in fieldname => value form
|
|
* @return array an array of metadata fields in fieldname => value form
|
|
*/
|
|
protected function selectFirst( $arrays ) {
|
|
// multiple metadata values for the same fields on the same image would not make much sense,
|
|
// so use the first value
|
|
return $arrays ? $arrays[0] : [];
|
|
}
|
|
|
|
/**
|
|
* Receives the list of information templates found by the template parser and selects which one
|
|
* to use. Also collects all the authors to make sure attribution requirements are honored.
|
|
* @param array $informationTemplates an array of information templates,
|
|
* each is an array of metdata fields in fieldname => value form
|
|
* @return array an array of metdata fields in fieldname => value form
|
|
*/
|
|
protected function selectInformationTemplate( array $informationTemplates ) {
|
|
if ( !$informationTemplates ) {
|
|
return [];
|
|
}
|
|
|
|
$authorCount = 0;
|
|
foreach ( $informationTemplates as $template ) {
|
|
if ( isset( $template['Artist'] ) ) {
|
|
$authorCount++;
|
|
}
|
|
}
|
|
|
|
if ( $authorCount > 1 ) {
|
|
$informationTemplates[0]['AuthorCount'] = $authorCount;
|
|
}
|
|
return $informationTemplates[0];
|
|
}
|
|
|
|
/**
|
|
* Receives the list of licenses found by the template parser and selects which one to use.
|
|
* @param array $licenses an array of licenses, each is an array of metadata fields
|
|
* in fieldname => value form
|
|
* @return array an array of metadata fields in fieldname => value form
|
|
*/
|
|
protected function selectLicense( array $licenses ) {
|
|
if ( !$licenses ) {
|
|
return [];
|
|
}
|
|
|
|
$sortedLicenses = $this->licenseParser->sortDataByLicensePriority( $licenses,
|
|
static function ( $license ) {
|
|
if ( !isset( $license['LicenseShortName'] ) ) {
|
|
return null;
|
|
}
|
|
return $license['LicenseShortName'];
|
|
}
|
|
);
|
|
|
|
// sortDataByLicensePriority puts things in right order but also rearranges the keys
|
|
// - we don't want that
|
|
$sortedLicenses = array_values( $sortedLicenses );
|
|
|
|
if ( !$sortedLicenses ) {
|
|
return [];
|
|
}
|
|
|
|
// T131896 - if any license template is marked nonfree, the image is probably nonfree
|
|
foreach ( $sortedLicenses as $license ) {
|
|
if ( !empty( $license['NonFree'] ) ) {
|
|
$sortedLicenses[0]['NonFree'] = $license['NonFree'];
|
|
break;
|
|
}
|
|
}
|
|
|
|
return $sortedLicenses[0];
|
|
}
|
|
|
|
/**
|
|
* Normalizes the metadata to wfTimestamp()'s TS_DB format
|
|
* @param array &$metadata
|
|
*/
|
|
protected function normalizeMetadataTimestamps( array &$metadata ) {
|
|
$fieldsToNormalize = [ 'DateTime', 'DateTimeOriginal' ];
|
|
foreach ( $fieldsToNormalize as $field ) {
|
|
if (
|
|
isset( $metadata[$field] ) &&
|
|
isset( $metadata[$field]['value'] ) &&
|
|
// Multilang values can get down here, which are arrays with
|
|
// '_type' => 'lang'. We don't want to pass an array to
|
|
// wfTimestamp: it won't work and will annoy PHP.
|
|
// @phan-suppress-next-line PhanTypeArraySuspicious
|
|
!isset( $metadata[$field]['value']['_type'] )
|
|
) {
|
|
$parsedTs = wfTimestamp( TS_DB, $metadata[$field]['value'] );
|
|
if ( $parsedTs ) {
|
|
$metadata[$field]['value'] = $parsedTs;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|