Provided by: libmediawiki-dumpfile-perl_0.2.2-2_all 
      
    
NAME
       Parse::MediaWikiDump::Pages - Object capable of processing dump files with a single revision per article
ABOUT
       This object is used to access the metadata associated with a MediaWiki instance and provide an iterative
       interface for extracting the individual articles out of the same. This module does not allow more than
       one revision for each specific article; to parse a comprehensive dump file use the
       Parse::MediaWikiDump::Revisions object.
SYNOPSIS
         use MediaWiki::DumpFile::Compat;
         $pmwd = Parse::MediaWikiDump->new;
         #the input is either the path to a dump file or a reference to an open filehandle
         $input = 'pages-articles.xml';
         $input = \*FILEHANDLE;
         #single argument form: the argument is the input itself
         $pages = $pmwd->pages($input);
         #hash form: the input option is required, fast_mode is optional and defaults to off
         $pages = $pmwd->pages(input => $input, fast_mode => 0);
         #print the title and id of each article inside the dump file
         while(defined($page = $pages->next)) {
           print "title '", $page->title, "' id ", $page->id, "\n";
         }
METHODS
       $pages->new
           Open  the specified MediaWiki dump file. If the single argument to this method is a string it will be
           used as the path to the file to open. If the argument is a reference to  a  filehandle  the  contents
           will be read from the filehandle as specified.
           If  more  than  one  argument  is supplied the arguments must be a hash of configuration options. The
           input option is required and is the same as previously described. The fast_mode option  is  optional,
           defaults to being off, and if set to a true value will cause the parser to run in a mode that is much
           faster but only provides access to the title and text contents of a page. See the documentation for
           MediaWiki::DumpFile::Pages for details about fast mode.
       $pages->next
           Returns an instance of the next available Parse::MediaWikiDump::page object or returns undef if there
           are no more articles left.
       $pages->version
           Returns a plain text string containing the revision number of the dump file format.
       $pages->sitename
           Returns a plain text string that is the name of the MediaWiki instance.
       $pages->base
           Returns the URL to the instance's main article in the form of a string.
       $pages->generator
           Returns a string containing 'MediaWiki' and a version number of the instance that dumped  this  file.
           Example: 'MediaWiki 1.14alpha'
       $pages->case
           Returns a string describing the case sensitivity configured in the instance.
       $pages->namespaces
           Returns a reference to an array of references. Each reference is to another array with the first item
           being  the  unique identifier of the namespace and the second element containing a string that is the
           name of the namespace.
       $pages->namespaces_names
           Returns a reference to an array that contains the name of each namespace as a string, one per element.
       $pages->current_byte
           Returns the number of bytes that have been processed so far.
       $pages->size
           Returns the total size of the dump file in bytes.
   Scan an article dump file for double redirects that exist in the most recent article revision
         #!/usr/bin/perl
         #reports every double redirect found in the main namespace of an article dump;
         #the findings are written to STDOUT while progress reports are written to STDERR
         use strict;
         use warnings;
         use MediaWiki::DumpFile::Compat;
         binmode(STDOUT, ":utf8");
         binmode(STDERR, ":utf8");
         #$pages, $artcount, $file_size and $start are also read by update_ui()
         my $file = shift(@ARGV);
         my $pmwd = Parse::MediaWikiDump->new;
         my $pages;
         my $file_size;
         my $artcount = 0;
         my %redirs;
         my $start = time;
         if (!defined($file)) {
           print STDERR "No file specified, using standard input\n";
           $pages = $pmwd->pages(\*STDIN);
         } else {
           #the size is only known for a named file; it is used for the percentage display
           $file_size = (stat($file))[7];
           $pages = $pmwd->pages($file);
         }
         #the case of the first letter of titles is ignored - insist on this setting
         #because the other values of the case setting are unknown
         if ($pages->case ne 'first-letter') {
           die 'this program only supports the first-letter case setting';
         }
         print STDERR "Analyzing articles:\n";
         while (defined(my $article = $pages->next)) {
           $artcount++;
           update_ui() if $artcount % 500 == 0;
           #only redirects that live in the main namespace are of interest
           next if $article->namespace ne '';
           next if !defined($article->redirect);
           #index each redirect target by the normalized title of the redirecting article
           my $from = case_fixer($article->title);
           my $to = case_fixer($article->redirect);
           $redirs{$from} = $to;
         }
         my $redir_count = scalar(keys(%redirs));
         print STDERR "done; searching $redir_count redirects:\n";
         #a redirect whose target is itself a key of the index is a double redirect
         my @doubles = grep { defined($redirs{$redirs{$_}}) } keys(%redirs);
         print "$_\n" foreach @doubles;
         my $count = scalar(@doubles);
         print STDERR "discovered $count double redirects\n";
         #normalizes a title so that the case of its very first letter does not matter;
         #when the title carries a namespace prefix (the text before the first colon) the
         #prefix is left untouched and the first letter after the colon is upper cased
         sub case_fixer {
           my ($title) = @_;
           #in list context the match returns both captures, or nothing without a namespace
           my ($namespace, $name) = $title =~ /^(.+?):(.+)/;
           return ucfirst($title) unless defined($namespace);
           return $namespace . ':' . ucfirst($name);
         }
         #formats a byte count for display using the largest unit whose value is greater
         #than one: whole bytes, whole kilobytes, megabytes with two decimal places or
         #gigabytes with four decimal places
         sub pretty_bytes {
           my ($count) = @_;
           my $kilo = $count / 1024;
           my $mega = $kilo / 1024;
           my $giga = $mega / 1024;
           #try the largest unit first; a value of exactly one stays in the smaller unit
           return sprintf("%0.4f", $giga) . ' gigabytes' if $giga > 1;
           return sprintf("%0.2f", $mega) . ' megabytes' if $mega > 1;
           return int($kilo) . ' kilobytes' if $kilo > 1;
           return int($count) . ' bytes';
         }
         #inserts a comma between every group of three characters of the argument,
         #counted from the right hand side, e.g. 1234567 becomes 1,234,567
         sub pretty_number {
           my ($value) = @_;
           #working on the reversed string lets the groups be counted from the right
           (my $backwards = reverse($value)) =~ s/(.{3})/$1,/g;
           #a length that is a multiple of three leaves a comma at the front once reversed back
           (my $grouped = reverse($backwards)) =~ s/^,//;
           return $grouped;
         }
         #prints one progress line to STDERR: the number of articles seen and the bytes
         #processed, followed by the percentage completed when the input size is known or
         #the average read rate when it is not
         #takes no arguments; reads the file level variables $start, $pages, $artcount and $file_size
         sub update_ui {
           my $seconds = time - $start;
           my $bytes = $pages->current_byte;
           print STDERR "  ", pretty_number($artcount),  " articles; ";
           print STDERR pretty_bytes($bytes), " processed; ";
           #a size of zero (as stat may report for a pipe or other special file) can not be
           #turned into a percentage, so fall back to the rate display in that case
           if (defined($file_size) && $file_size > 0) {
             my $percent = int($bytes / $file_size * 100);
             print STDERR "$percent% completed\n";
           } else {
             #time has a resolution of one second so the first update can arrive with zero
             #seconds elapsed; count that as one second instead of dividing by zero
             $seconds = 1 if $seconds < 1;
             my $bytes_per_second = int($bytes / $seconds);
             print STDERR pretty_bytes($bytes_per_second), " per second\n";
           }
         }
   Version 0.4
       This class was updated to support version 0.4 dump files from  a  MediaWiki  instance  but  it  does  not
       currently support any of the new information available in those files.
perl v5.34.0                                       2022-06-15             MediaWiki::Dump...::Compat::Pages(3pm)