If you download the Wikipedia XML file yourself, it should use Unix-style line endings, i.e. a single linefeed (ASCII 10) as the line delimiter.
In that case you might want to try this code:
Code: Select all
on mouseUp
   -- Builds a title index for a Wikipedia XML dump.
   -- The dump is read in large chunks; every occurrence of "<Title>" is located
   -- and one line "<text>|<offset>" is appended to the index file, where
   -- <offset> is the 1-based char position of the opening tag within the dump.
   -- Progress is shown in field "LabelStatusCount" every 1000 titles and
   -- start/finish times are logged to field "ListLog".
   put empty into field "ListLog"
   put empty into field "LabelStatusCount"
   put "Started: " & the short system date & " - " & the long system time after field "ListLog"
   set the enabled of button "Generate Indexing" to false
   put "/Users/berndnig/Desktop/WikipediaIndex3.dat" into varIndexFile
   put "/Users/berndnig/Desktop/dewiki-latest-pages-articles.xml" into varFileToRead

   -- Bail out early if either file cannot be opened; otherwise the read loop
   -- below would never see "eof" and would spin forever.
   open file varFileToRead for binary read
   put the result into tOpenResult
   if tOpenResult is not empty then
      set the enabled of button "Generate Indexing" to true
      answer "Could not open" && varFileToRead & ":" && tOpenResult
      exit mouseUp
   end if
   open file varIndexFile for write
   put the result into tOpenResult
   if tOpenResult is not empty then
      close file varFileToRead
      set the enabled of button "Generate Indexing" to true
      answer "Could not open" && varIndexFile & ":" && tOpenResult
      exit mouseUp
   end if

   put 0 into varCharLoc -- chars of the dump consumed so far (running absolute offset)
   put 0 into varTally -- titles collected since the last flush to disk
   put 0 into tLabelCount -- titles already flushed and shown in the status field
   put empty into varIndexData -- pending index lines not yet written
   put empty into tReadError
   put false into tReachedTheEnd
   put the milliseconds into tStart

   repeat until tReachedTheEnd
      read from file varFileToRead for 10000000
      -- capture the result before any other command can overwrite it
      put the result into tReadResult
      put it into tPartText
      if tReadResult is not empty then
         -- "eof" is the normal end; anything else is an error that must also stop the loop
         put true into tReachedTheEnd
         if tReadResult is not "eof" then put tReadResult into tReadError
      end if

      -- the read could have ended before a line end, so read on until the line is complete;
      -- this way a chunk should always hold complete lines including </title>
      if not tReachedTheEnd then
         read from file varFileToRead for 1 line
         put the result into tReadResult
         put it after tPartText
         if tReadResult is not empty then
            put true into tReachedTheEnd
            if tReadResult is not "eof" then put tReadResult into tReadError
         end if
      end if

      put 0 into tSkipChars
      put 0 into tEndTitle
      repeat
         -- offset() with a skip count returns a position relative to the skipped chars
         put offset("<Title>", tPartText, tSkipChars) into tFound
         if tFound = 0 then
            -- no more titles in this chunk: account for the unscanned remainder
            put varCharLoc + (length(tPartText) - tSkipChars) into varCharLoc
            exit repeat
         end if
         put varCharLoc + tFound into varCharLoc
         put tSkipChars + tFound into tSkipChars
         put offset("</Title>", tPartText, tSkipChars) into tEndTitle
         -- skip titles whose closing tag is missing from this chunk instead of
         -- writing an entry with empty text
         if tEndTitle > 0 then
            -- NOTE(review): the span starts at the "<" of the opening tag, so the stored
            -- text includes the opening tag itself - kept as in the original; confirm intent
            put char tSkipChars to tSkipChars + tEndTitle - 1 of tPartText & "|" & varCharLoc & return after varIndexData
            put varTally + 1 into varTally
            if varTally is 1000 then
               put tLabelCount + varTally into tLabelCount
               put tLabelCount into field "LabelStatusCount"
               -- yield briefly so the status field can redraw
               wait 0 milliseconds with messages
               put 0 into varTally
               write varIndexData to file varIndexFile at eof
               put empty into varIndexData
            end if
         end if
      end repeat
   end repeat

   -- flush whatever is left (fewer than 1000 entries) and release both files
   write varIndexData to file varIndexFile at eof
   close file varFileToRead
   close file varIndexFile

   -- put the milliseconds - tStart into field "myTime"
   set the enabled of button "Generate Indexing" to true
   set the enabled of field "ListLog" to true
   if tReadError is not empty then
      put cr & "Read stopped early: " & tReadError after field "ListLog"
   end if
   put cr & "Complete: " & the short system date & " - " & the long system time after field "ListLog"
   put tLabelCount + varTally into field "LabelStatusCount"
   answer "Wikipedia data indexed and ready for use."
end mouseUp
regards
Bernd