인문지식 처리와 프로그래밍 2020. 4. 16.
soook
CRAWLing
공공의 이익을 위해서가 아니면 안 된다!!!
문화재
#!/usr/bin/python
import sys
import os
import urllib.request
import shutil
# Detail-page URL prefix on the Cultural Heritage Administration's National
# Cultural Heritage Portal. downHtml() appends one identifier from the list
# file to it as the ccbaCpno query value.
address = "http://www.heritage.go.kr/heri/cul/culSelectDetail.do?ccbaCpno="
def _extract_jpg_url(line):
    """Return the first absolute http(s) URL ending in ".jpg" in line, or None."""
    # Search an upper-cased copy so the match is case-insensitive, but slice
    # the original text so the URL keeps its own casing.
    upper = line.upper()
    starts = [pos for pos in (upper.find("HTTP:"), upper.find("HTTPS:"))
              if pos != -1]
    if not starts:
        return None
    begin = min(starts)
    # Only an extension that follows the scheme can terminate the URL.
    end = upper.find(".JPG", begin)
    if end == -1:
        return None
    return line[begin:end + len(".JPG")]


def downImage(document, sequence, urlstring):
    """Download the JPEG referenced by an HTML line next to its saved page.

    The image is written to ``<document without its last 4 characters>-NNN.jpg``
    where ``NNN`` is ``sequence`` zero-padded to three digits.  When the line
    contains no absolute http(s) ``.jpg`` URL, or the download fails, the
    placeholder ``noimage.jpg`` from the current directory is copied to that
    path instead.

    Args:
        document: Path of the saved HTML page, e.g. ``folder/1234.htm``.
        sequence: Index of the image within the page.
        urlstring: Raw HTML line expected to contain the image URL.

    Raises:
        OSError: If the placeholder ``noimage.jpg`` cannot be copied.
    """
    path = "{0}-{1:03}.jpg".format(document[:-4], sequence)

    url = _extract_jpg_url(urlstring)
    downloaded = False
    if url is not None:
        try:
            urllib.request.urlretrieve(url, path)
            downloaded = True
        except Exception:
            # Best effort: any download failure falls back to the placeholder.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            downloaded = False

    if not downloaded:
        shutil.copy2('noimage.jpg', path)
def findImage(path):
    """Fetch the first JPEG referenced in a saved HTML page.

    The file at ``path`` is scanned line by line.  The first line containing
    ".jpg" (in any letter case) is passed, stripped, to ``downImage`` as image
    number 0 and scanning stops.  If no line matches, a message is printed and
    the placeholder ``noimage.jpg`` is copied to
    ``<path without its last 4 characters>-000.jpg``.

    Args:
        path: Path of the saved HTML page, e.g. ``folder/1234.htm``.
    """
    # errors='replace' keeps one badly encoded page from aborting the crawl;
    # the ".JPG" marker being searched for is plain ASCII.
    # The with-block also closes the file on the early return below.
    with open(path, 'rt', encoding='UTF8', errors='replace') as g:
        for line in g:
            if ".JPG" in line.upper():
                # Only the first image is wanted, so its index is always 0.
                downImage(path, 0, line.strip())
                return

    print("{0}: No Image".format(path))
    shutil.copy2('noimage.jpg', "{0}-000.jpg".format(path[:-4]))
def downHtml(folder, file):
    """Save the detail page for one identifier, then fetch its image.

    The page at ``address + file`` is stored as ``<folder>/<file>.htm`` and
    handed to ``findImage``.  If the page cannot be retrieved, a message is
    printed and the entry is skipped.

    Args:
        folder: Existing directory that receives the downloaded files.
        file: Identifier appended to the base URL; also the output file stem.
    """
    url = address + file
    path = "{0}/{1}.htm".format(folder, file)
    try:
        urllib.request.urlretrieve(url, path)
    except Exception:
        # Any retrieval failure skips this entry.  Catching Exception rather
        # than using a bare except lets Ctrl-C stop the crawl.
        print("{0}: Invalid Cpno!".format(file))
        return
    findImage(path)
def main():
    """Crawl every identifier listed in ``<name>.lst`` into folder ``<name>``.

    ``<name>`` is the first command-line argument.  The folder is created if
    needed, then each non-blank line of the list file is stripped and passed
    to ``downHtml``.  Without an argument a usage hint is written to stderr
    and nothing else happens.
    """
    if len(sys.argv) < 2:
        print("usage: python <script> <name>  (reads <name>.lst, writes into <name>/)",
              file=sys.stderr)
        return

    filename = sys.argv[1]
    list_path = filename + ".lst"
    folder = filename
    os.makedirs(folder, exist_ok=True)

    with open(list_path) as f:
        for line in f:
            file = line.strip()
            if not file:
                # A blank line would otherwise be crawled as an empty id.
                continue
            print("{0}: Processing....".format(file))
            downHtml(folder, file)
# Start the crawl as soon as this script is executed.
main()
민족문화대백과사전
#!/usr/bin/python
import sys
import os
import urllib.request
import shutil
# Encyclopedia of Korean Culture: article URL prefix (an item id is appended).
address1 = "http://encykorea.aks.ac.kr/Contents/Item/"
# Encyclopedia of Korean Culture: image URL prefix (an image id is appended).
address2 = "http://encykorea.aks.ac.kr/Contents/GetImage?id="
def downImage(document, sequence, urlstring):
    """Download one encyclopedia image by id next to its saved article.

    The image at ``address2 + urlstring`` is written to
    ``<document without its last 4 characters>-NNN.jpg`` where ``NNN`` is
    ``sequence`` zero-padded to three digits.  If the download fails, the
    placeholder ``noimage.jpg`` from the current directory is copied to that
    path instead.

    Args:
        document: Path of the saved article page, e.g. ``folder/E0001234.htm``.
        sequence: Index of the image within the page.
        urlstring: Image id to append to the image base URL.

    Raises:
        OSError: If the placeholder ``noimage.jpg`` cannot be copied.
    """
    url = address2 + urlstring
    path = "{0}-{1:03}.jpg".format(document[:-4], sequence)
    try:
        urllib.request.urlretrieve(url, path)
    except Exception:
        # Best effort: any download failure falls back to the placeholder.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        shutil.copy2('noimage.jpg', path)
def findImage(path):
    """Fetch the first image referenced in a saved encyclopedia article.

    The file at ``path`` is scanned line by line for "/GetImage?id=".  The id
    that follows on the first matching line is passed to ``downImage`` as image
    number 0 and scanning stops.  If no line matches, a message is printed and
    the placeholder ``noimage.jpg`` is copied to
    ``<path without its last 4 characters>-000.jpg``.

    Example of a matching line fragment:
        src="/Contents/GetImage?id=ceb01938-4ed4-4bd8-ac91-e884b7fc0460&w=260&h=260&fit=w&clip=1"
    A second form seen in the pages, which is not parsed here:
        multi_item = {mmid:'2a71cba8-eba7-4e1d-b4b0-30818c766013'

    Args:
        path: Path of the saved article page, e.g. ``folder/E0001234.htm``.
    """
    marker = "/GetImage?id="
    terminators = "\"'&<> \t\r\n"

    # errors='replace' keeps one badly encoded page from aborting the crawl;
    # the marker being searched for is plain ASCII.
    # The with-block also closes the file on the early return below.
    with open(path, 'rt', encoding='UTF8', errors='replace') as g:
        for line in g:
            start = line.find(marker)
            if start == -1:
                continue
            start += len(marker)

            # The id normally ends at the width parameter.  Search only after
            # the marker so an earlier "&w=" on the line cannot be picked up.
            end = line.find("&w=", start)
            if end == -1:
                # No width parameter: stop at the end of the attribute value
                # instead of silently dropping the last character of the line.
                end = next(
                    (i for i in range(start, len(line)) if line[i] in terminators),
                    len(line),
                )

            # Only the first image is wanted, so its index is always 0.
            downImage(path, 0, line[start:end])
            return

    print("{0}: No Image".format(path))
    shutil.copy2('noimage.jpg', "{0}-000.jpg".format(path[:-4]))
def downHtml(folder, file):
    """Save the article page for one item id, then fetch its image.

    The page at ``address1 + file`` is stored as ``<folder>/<file>.htm`` and
    handed to ``findImage``.  If the page cannot be retrieved, a message is
    printed and the entry is skipped.

    Args:
        folder: Existing directory that receives the downloaded files.
        file: Item id appended to the base URL; also the output file stem.
    """
    url = address1 + file
    path = "{0}/{1}.htm".format(folder, file)
    try:
        urllib.request.urlretrieve(url, path)
    except Exception:
        # Any retrieval failure skips this entry.  Catching Exception rather
        # than using a bare except lets Ctrl-C stop the crawl.
        print("{0}: Invalid ID!".format(file))
        return
    findImage(path)
def main():
    """Crawl every item id listed in ``<name>.lst`` into folder ``<name>``.

    ``<name>`` is the first command-line argument.  The folder is created if
    needed, then each non-blank line of the list file is stripped and passed
    to ``downHtml``.  Without an argument a usage hint is written to stderr
    and nothing else happens.
    """
    if len(sys.argv) < 2:
        print("usage: python <script> <name>  (reads <name>.lst, writes into <name>/)",
              file=sys.stderr)
        return

    filename = sys.argv[1]
    list_path = filename + ".lst"
    folder = filename
    os.makedirs(folder, exist_ok=True)

    with open(list_path) as f:
        for line in f:
            file = line.strip()
            if not file:
                # A blank line would otherwise be crawled as an empty id.
                continue
            print("{0}: Processing....".format(file))
            downHtml(folder, file)
# Start the crawl as soon as this script is executed.
main()