ImportMediawiki.php 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593
  1. <?php
  2. /**
  3. * MediaWiki import plugin for phpMyAdmin
  4. */
  5. declare(strict_types=1);
  6. namespace PhpMyAdmin\Plugins\Import;
  7. use PhpMyAdmin\File;
  8. use PhpMyAdmin\Message;
  9. use PhpMyAdmin\Plugins\ImportPlugin;
  10. use PhpMyAdmin\Properties\Plugins\ImportPluginProperties;
  11. use function __;
  12. use function count;
  13. use function explode;
  14. use function mb_strlen;
  15. use function mb_strpos;
  16. use function mb_substr;
  17. use function preg_match;
  18. use function str_contains;
  19. use function str_replace;
  20. use function strcmp;
  21. use function strlen;
  22. use function trim;
  23. /**
  24. * Handles the import for the MediaWiki format
  25. */
  26. class ImportMediawiki extends ImportPlugin
  27. {
  28. /**
  29. * Whether to analyze tables
  30. *
  31. * @var bool
  32. */
  33. private $analyze;
  34. /**
  35. * @psalm-return non-empty-lowercase-string
  36. */
  37. public function getName(): string
  38. {
  39. return 'mediawiki';
  40. }
  41. protected function setProperties(): ImportPluginProperties
  42. {
  43. $this->setAnalyze(false);
  44. if ($GLOBALS['plugin_param'] !== 'table') {
  45. $this->setAnalyze(true);
  46. }
  47. $importPluginProperties = new ImportPluginProperties();
  48. $importPluginProperties->setText(__('MediaWiki Table'));
  49. $importPluginProperties->setExtension('txt');
  50. $importPluginProperties->setMimeType('text/plain');
  51. $importPluginProperties->setOptionsText(__('Options'));
  52. return $importPluginProperties;
  53. }
  54. /**
  55. * Handles the whole import logic
  56. *
  57. * @param array $sql_data 2-element array with sql data
  58. */
  59. public function doImport(?File $importHandle = null, array &$sql_data = []): void
  60. {
  61. global $error, $timeout_passed, $finished;
  62. // Defaults for parser
  63. // The buffer that will be used to store chunks read from the imported file
  64. $buffer = '';
  65. // Used as storage for the last part of the current chunk data
  66. // Will be appended to the first line of the next chunk, if there is one
  67. $last_chunk_line = '';
  68. // Remembers whether the current buffer line is part of a comment
  69. $inside_comment = false;
  70. // Remembers whether the current buffer line is part of a data comment
  71. $inside_data_comment = false;
  72. // Remembers whether the current buffer line is part of a structure comment
  73. $inside_structure_comment = false;
  74. // MediaWiki only accepts "\n" as row terminator
  75. $mediawiki_new_line = "\n";
  76. // Initialize the name of the current table
  77. $cur_table_name = '';
  78. $cur_temp_table_headers = [];
  79. $cur_temp_table = [];
  80. $in_table_header = false;
  81. while (! $finished && ! $error && ! $timeout_passed) {
  82. $data = $this->import->getNextChunk($importHandle);
  83. if ($data === false) {
  84. // Subtract data we didn't handle yet and stop processing
  85. $GLOBALS['offset'] -= mb_strlen($buffer);
  86. break;
  87. }
  88. if ($data !== true) {
  89. // Append new data to buffer
  90. $buffer = $data;
  91. unset($data);
  92. // Don't parse string if we're not at the end
  93. // and don't have a new line inside
  94. if (! str_contains($buffer, $mediawiki_new_line)) {
  95. continue;
  96. }
  97. }
  98. // Because of reading chunk by chunk, the first line from the buffer
  99. // contains only a portion of an actual line from the imported file.
  100. // Therefore, we have to append it to the last line from the previous
  101. // chunk. If we are at the first chunk, $last_chunk_line should be empty.
  102. $buffer = $last_chunk_line . $buffer;
  103. // Process the buffer line by line
  104. $buffer_lines = explode($mediawiki_new_line, $buffer);
  105. $full_buffer_lines_count = count($buffer_lines);
  106. // If the reading is not finalized, the final line of the current chunk
  107. // will not be complete
  108. if (! $finished) {
  109. $last_chunk_line = $buffer_lines[--$full_buffer_lines_count];
  110. }
  111. for ($line_nr = 0; $line_nr < $full_buffer_lines_count; ++$line_nr) {
  112. $cur_buffer_line = trim($buffer_lines[$line_nr]);
  113. // If the line is empty, go to the next one
  114. if ($cur_buffer_line === '') {
  115. continue;
  116. }
  117. $first_character = $cur_buffer_line[0];
  118. $matches = [];
  119. // Check beginning of comment
  120. if (! strcmp(mb_substr($cur_buffer_line, 0, 4), '<!--')) {
  121. $inside_comment = true;
  122. continue;
  123. }
  124. if ($inside_comment) {
  125. // Check end of comment
  126. if (! strcmp(mb_substr($cur_buffer_line, 0, 4), '-->')) {
  127. // Only data comments are closed. The structure comments
  128. // will be closed when a data comment begins (in order to
  129. // skip structure tables)
  130. if ($inside_data_comment) {
  131. $inside_data_comment = false;
  132. }
  133. // End comments that are not related to table structure
  134. if (! $inside_structure_comment) {
  135. $inside_comment = false;
  136. }
  137. } else {
  138. // Check table name
  139. $match_table_name = [];
  140. if (preg_match('/^Table data for `(.*)`$/', $cur_buffer_line, $match_table_name)) {
  141. $cur_table_name = $match_table_name[1];
  142. $inside_data_comment = true;
  143. $inside_structure_comment = $this->mngInsideStructComm($inside_structure_comment);
  144. } elseif (preg_match('/^Table structure for `(.*)`$/', $cur_buffer_line, $match_table_name)) {
  145. // The structure comments will be ignored
  146. $inside_structure_comment = true;
  147. }
  148. }
  149. continue;
  150. }
  151. if (preg_match('/^\{\|(.*)$/', $cur_buffer_line, $matches)) {
  152. // Check start of table
  153. // This will store all the column info on all rows from
  154. // the current table read from the buffer
  155. $cur_temp_table = [];
  156. // Will be used as storage for the current row in the buffer
  157. // Once all its columns are read, it will be added to
  158. // $cur_temp_table and then it will be emptied
  159. $cur_temp_line = [];
  160. // Helps us differentiate the header columns
  161. // from the normal columns
  162. $in_table_header = false;
  163. // End processing because the current line does not
  164. // contain any column information
  165. } elseif (
  166. mb_substr($cur_buffer_line, 0, 2) === '|-'
  167. || mb_substr($cur_buffer_line, 0, 2) === '|+'
  168. || mb_substr($cur_buffer_line, 0, 2) === '|}'
  169. ) {
  170. // Check begin row or end table
  171. // Add current line to the values storage
  172. if (! empty($cur_temp_line)) {
  173. // If the current line contains header cells
  174. // ( marked with '!' ),
  175. // it will be marked as table header
  176. if ($in_table_header) {
  177. // Set the header columns
  178. $cur_temp_table_headers = $cur_temp_line;
  179. } else {
  180. // Normal line, add it to the table
  181. $cur_temp_table[] = $cur_temp_line;
  182. }
  183. }
  184. // Empty the temporary buffer
  185. $cur_temp_line = [];
  186. // No more processing required at the end of the table
  187. if (mb_substr($cur_buffer_line, 0, 2) === '|}') {
  188. $current_table = [
  189. $cur_table_name,
  190. $cur_temp_table_headers,
  191. $cur_temp_table,
  192. ];
  193. // Import the current table data into the database
  194. $this->importDataOneTable($current_table, $sql_data);
  195. // Reset table name
  196. $cur_table_name = '';
  197. }
  198. // What's after the row tag is now only attributes
  199. } elseif (($first_character === '|') || ($first_character === '!')) {
  200. // Check cell elements
  201. // Header cells
  202. if ($first_character === '!') {
  203. // Mark as table header, but treat as normal row
  204. $cur_buffer_line = str_replace('!!', '||', $cur_buffer_line);
  205. // Will be used to set $cur_temp_line as table header
  206. $in_table_header = true;
  207. } else {
  208. $in_table_header = false;
  209. }
  210. // Loop through each table cell
  211. $cells = $this->explodeMarkup($cur_buffer_line);
  212. foreach ($cells as $cell) {
  213. $cell = $this->getCellData($cell);
  214. // Delete the beginning of the column, if there is one
  215. $cell = trim($cell);
  216. $col_start_chars = [
  217. '|',
  218. '!',
  219. ];
  220. foreach ($col_start_chars as $col_start_char) {
  221. $cell = $this->getCellContent($cell, $col_start_char);
  222. }
  223. // Add the cell to the row
  224. $cur_temp_line[] = $cell;
  225. }
  226. } else {
  227. // If it's none of the above, then the current line has a bad
  228. // format
  229. $message = Message::error(
  230. __('Invalid format of mediawiki input on line: <br>%s.')
  231. );
  232. $message->addParam($cur_buffer_line);
  233. $error = true;
  234. }
  235. }
  236. }
  237. }
  238. /**
  239. * Imports data from a single table
  240. *
  241. * @param array $table containing all table info:
  242. * <code> $table[0] - string
  243. * containing table name
  244. * $table[1] - array[] of
  245. * table headers $table[2] -
  246. * array[][] of table content
  247. * rows </code>
  248. * @param array $sql_data 2-element array with sql data
  249. *
  250. * @global bool $analyze whether to scan for column types
  251. */
  252. private function importDataOneTable(array $table, array &$sql_data): void
  253. {
  254. $analyze = $this->getAnalyze();
  255. if ($analyze) {
  256. // Set the table name
  257. $this->setTableName($table[0]);
  258. // Set generic names for table headers if they don't exist and the table has some data
  259. if ($table[2] !== []) {
  260. $this->setTableHeaders($table[1], $table[2][0]);
  261. }
  262. // Create the tables array to be used in Import::buildSql()
  263. $tables = [];
  264. $tables[] = [
  265. $table[0],
  266. $table[1],
  267. $table[2],
  268. ];
  269. // Obtain the best-fit MySQL types for each column
  270. $analyses = [];
  271. $analyses[] = $this->import->analyzeTable($tables[0]);
  272. $this->executeImportTables($tables, $analyses, $sql_data);
  273. }
  274. // Commit any possible data in buffers
  275. $this->import->runQuery('', '', $sql_data);
  276. }
  277. /**
  278. * Sets the table name
  279. *
  280. * @param string $table_name reference to the name of the table
  281. */
  282. private function setTableName(&$table_name): void
  283. {
  284. global $dbi;
  285. if (! empty($table_name)) {
  286. return;
  287. }
  288. $result = $dbi->fetchResult('SHOW TABLES');
  289. // todo check if the name below already exists
  290. $table_name = 'TABLE ' . (count($result) + 1);
  291. }
  292. /**
  293. * Set generic names for table headers, if they don't exist
  294. *
  295. * @param array $table_headers reference to the array containing the headers
  296. * of a table
  297. * @param array $table_row array containing the first content row
  298. */
  299. private function setTableHeaders(array &$table_headers, array $table_row): void
  300. {
  301. if (! empty($table_headers)) {
  302. return;
  303. }
  304. // The first table row should contain the number of columns
  305. // If they are not set, generic names will be given (COL 1, COL 2, etc)
  306. $num_cols = count($table_row);
  307. for ($i = 0; $i < $num_cols; ++$i) {
  308. $table_headers[$i] = 'COL ' . ($i + 1);
  309. }
  310. }
  311. /**
  312. * Sets the database name and additional options and calls Import::buildSql()
  313. * Used in PMA_importDataAllTables() and $this->importDataOneTable()
  314. *
  315. * @param array $tables structure:
  316. * array(
  317. * array(table_name, array() column_names, array()()
  318. * rows)
  319. * )
  320. * @param array $analyses structure:
  321. * $analyses = array(
  322. * array(array() column_types, array() column_sizes)
  323. * )
  324. * @param array $sql_data 2-element array with sql data
  325. *
  326. * @global string $db name of the database to import in
  327. */
  328. private function executeImportTables(array &$tables, array &$analyses, array &$sql_data): void
  329. {
  330. global $db;
  331. // $db_name : The currently selected database name, if applicable
  332. // No backquotes
  333. // $options : An associative array of options
  334. [$db_name, $options] = $this->getDbnameAndOptions($db, 'mediawiki_DB');
  335. // Array of SQL strings
  336. // Non-applicable parameters
  337. $create = null;
  338. // Create and execute necessary SQL statements from data
  339. $this->import->buildSql($db_name, $tables, $analyses, $create, $options, $sql_data);
  340. }
  341. /**
  342. * Replaces all instances of the '||' separator between delimiters
  343. * in a given string
  344. *
  345. * @param string $replace the string to be replaced with
  346. * @param string $subject the text to be replaced
  347. *
  348. * @return string with replacements
  349. */
  350. private function delimiterReplace($replace, $subject)
  351. {
  352. // String that will be returned
  353. $cleaned = '';
  354. // Possible states of current character
  355. $inside_tag = false;
  356. $inside_attribute = false;
  357. // Attributes can be declared with either " or '
  358. $start_attribute_character = false;
  359. // The full separator is "||";
  360. // This remembers if the previous character was '|'
  361. $partial_separator = false;
  362. // Parse text char by char
  363. for ($i = 0, $iMax = strlen($subject); $i < $iMax; $i++) {
  364. $cur_char = $subject[$i];
  365. // Check for separators
  366. if ($cur_char === '|') {
  367. // If we're not inside a tag, then this is part of a real separator,
  368. // so we append it to the current segment
  369. if (! $inside_attribute) {
  370. $cleaned .= $cur_char;
  371. if ($partial_separator) {
  372. $inside_tag = false;
  373. $inside_attribute = false;
  374. }
  375. } elseif ($partial_separator) {
  376. // If we are inside a tag, we replace the current char with
  377. // the placeholder and append that to the current segment
  378. $cleaned .= $replace;
  379. }
  380. // If the previous character was also '|', then this ends a
  381. // full separator. If not, this may be the beginning of one
  382. $partial_separator = ! $partial_separator;
  383. } else {
  384. // If we're inside a tag attribute and the current character is
  385. // not '|', but the previous one was, it means that the single '|'
  386. // was not appended, so we append it now
  387. if ($partial_separator && $inside_attribute) {
  388. $cleaned .= '|';
  389. }
  390. // If the char is different from "|", no separator can be formed
  391. $partial_separator = false;
  392. // any other character should be appended to the current segment
  393. $cleaned .= $cur_char;
  394. if ($cur_char === '<' && ! $inside_attribute) {
  395. // start of a tag
  396. $inside_tag = true;
  397. } elseif ($cur_char === '>' && ! $inside_attribute) {
  398. // end of a tag
  399. $inside_tag = false;
  400. } elseif (($cur_char === '"' || $cur_char == "'") && $inside_tag) {
  401. // start or end of an attribute
  402. if (! $inside_attribute) {
  403. $inside_attribute = true;
  404. // remember the attribute`s declaration character (" or ')
  405. $start_attribute_character = $cur_char;
  406. } else {
  407. if ($cur_char == $start_attribute_character) {
  408. $inside_attribute = false;
  409. // unset attribute declaration character
  410. $start_attribute_character = false;
  411. }
  412. }
  413. }
  414. }
  415. }
  416. return $cleaned;
  417. }
  418. /**
  419. * Separates a string into items, similarly to explode
  420. * Uses the '||' separator (which is standard in the mediawiki format)
  421. * and ignores any instances of it inside markup tags
  422. * Used in parsing buffer lines containing data cells
  423. *
  424. * @param string $text text to be split
  425. *
  426. * @return array
  427. */
  428. private function explodeMarkup($text)
  429. {
  430. $separator = '||';
  431. $placeholder = "\x00";
  432. // Remove placeholder instances
  433. $text = str_replace($placeholder, '', $text);
  434. // Replace instances of the separator inside HTML-like
  435. // tags with the placeholder
  436. $cleaned = $this->delimiterReplace($placeholder, $text);
  437. // Explode, then put the replaced separators back in
  438. $items = explode($separator, $cleaned);
  439. foreach ($items as $i => $str) {
  440. $items[$i] = str_replace($placeholder, $separator, $str);
  441. }
  442. return $items;
  443. }
  444. /* ~~~~~~~~~~~~~~~~~~~~ Getters and Setters ~~~~~~~~~~~~~~~~~~~~ */
  445. /**
  446. * Returns true if the table should be analyzed, false otherwise
  447. */
  448. private function getAnalyze(): bool
  449. {
  450. return $this->analyze;
  451. }
  452. /**
  453. * Sets to true if the table should be analyzed, false otherwise
  454. *
  455. * @param bool $analyze status
  456. */
  457. private function setAnalyze($analyze): void
  458. {
  459. $this->analyze = $analyze;
  460. }
  461. /**
  462. * Get cell
  463. *
  464. * @param string $cell Cell
  465. *
  466. * @return mixed
  467. */
  468. private function getCellData($cell)
  469. {
  470. // A cell could contain both parameters and data
  471. $cell_data = explode('|', $cell, 2);
  472. // A '|' inside an invalid link should not
  473. // be mistaken as delimiting cell parameters
  474. if (! str_contains($cell_data[0], '[[')) {
  475. return $cell;
  476. }
  477. if (count($cell_data) === 1) {
  478. return $cell_data[0];
  479. }
  480. return $cell_data[1];
  481. }
  482. /**
  483. * Manage $inside_structure_comment
  484. *
  485. * @param bool $inside_structure_comment Value to test
  486. */
  487. private function mngInsideStructComm($inside_structure_comment): bool
  488. {
  489. // End ignoring structure rows
  490. if ($inside_structure_comment) {
  491. $inside_structure_comment = false;
  492. }
  493. return $inside_structure_comment;
  494. }
  495. /**
  496. * Get cell content
  497. *
  498. * @param string $cell Cell
  499. * @param string $col_start_char Start char
  500. *
  501. * @return string
  502. */
  503. private function getCellContent($cell, $col_start_char)
  504. {
  505. if (mb_strpos($cell, $col_start_char) === 0) {
  506. $cell = trim(mb_substr($cell, 1));
  507. }
  508. return $cell;
  509. }
  510. }