"인문지식 처리와 프로그래밍2020 4.16"의 두 판 사이의 차이
soook
(새 문서: <pre> #!/usr/bin/python import sys import os import urllib.request import shutil address = "http://www.heritage.go.kr/heri/cul/culSelectDetail.do?ccbaCpno=" #문화재청 국가...) |
(→CRAWLing) |
||
| (같은 사용자의 중간 판 5개는 보이지 않습니다) | |||
| 1번째 줄: | 1번째 줄: | ||
| + | =CRAWLing= | ||
| + | 공공의 이익을 위해서가 아니면 안된다!!! | ||
| + | |||
| + | *[https://www.data.go.kr/ 공공데이터 포털] | ||
| + | |||
| + | ==문화재== | ||
<pre> | <pre> | ||
| 58번째 줄: | 64번째 줄: | ||
except: | except: | ||
print( "{0}: Invalid Cpno!".format( file ) ) | print( "{0}: Invalid Cpno!".format( file ) ) | ||
| + | return | ||
| + | |||
| + | findImage( path ) | ||
| + | |||
| + | def main(): | ||
| + | |||
| + | try: | ||
| + | filename = sys.argv[1] | ||
| + | except: | ||
| + | return | ||
| + | |||
| + | list = filename.__add__( ".lst" ) | ||
| + | folder = filename | ||
| + | |||
| + | try: | ||
| + | os.makedirs( folder ) | ||
| + | except OSError: | ||
| + | pass | ||
| + | |||
| + | i = 0 | ||
| + | f = open( list ) | ||
| + | |||
| + | while 1: | ||
| + | line = f.readline() | ||
| + | if not line: break | ||
| + | file = line.strip() | ||
| + | print( "{0}: Processing....".format(file)) | ||
| + | downHtml( folder, file ) | ||
| + | |||
| + | f.close() | ||
| + | |||
| + | main() | ||
| + | |||
| + | |||
| + | </pre> | ||
| + | |||
| + | ==민족문화대백과사전== | ||
| + | |||
| + | <pre> | ||
| + | #!/usr/bin/python | ||
| + | |||
| + | import sys | ||
| + | import os | ||
| + | import urllib.request | ||
| + | import shutil | ||
| + | |||
| + | address1 = "http://encykorea.aks.ac.kr/Contents/Item/" #민족문화대백과사전 기사 Url | ||
| + | address2 = "http://encykorea.aks.ac.kr/Contents/GetImage?id=" #민족문화대백과사전 이미지 Url | ||
| + | |||
| + | def downImage( document, sequence, urlstring ): | ||
| + | |||
| + | url = address2.__add__(urlstring) | ||
| + | filename=document[:-4] | ||
| + | suffix = "{:03}.jpg".format(sequence) | ||
| + | path = "{0}-{1}".format(filename, suffix) | ||
| + | |||
| + | try: | ||
| + | urllib.request.urlretrieve(url, path) | ||
| + | except: | ||
| + | shutil.copy2('noimage.jpg', path) | ||
| + | |||
| + | |||
| + | def findImage( path ): | ||
| + | |||
| + | #src="/Contents/GetImage?id=ceb01938-4ed4-4bd8-ac91-e884b7fc0460&w=260&h=260&fit=w&clip=1" | ||
| + | #multi_item = {mmid:'2a71cba8-eba7-4e1d-b4b0-30818c766013' | ||
| + | |||
| + | g = open( path, 'rt', encoding='UTF8') | ||
| + | |||
| + | found = 0 | ||
| + | |||
| + | while 1: | ||
| + | line = g.readline() | ||
| + | if not line: break | ||
| + | |||
| + | a=line.find("/GetImage?id=") | ||
| + | if( a != -1 ): | ||
| + | b=line.find("&w=") | ||
| + | whatIfound=line[a+13:b] | ||
| + | downImage( path, found, whatIfound ) | ||
| + | found += 1 | ||
| + | return | ||
| + | |||
| + | if found == 0: | ||
| + | print( "{0}: No Image".format(path)) | ||
| + | path = "{0}-000.jpg".format(path[:-4]) | ||
| + | shutil.copy2('noimage.jpg', path) | ||
| + | |||
| + | g.close() | ||
| + | |||
| + | def downHtml( folder, file ): | ||
| + | |||
| + | url = address1.__add__(file) | ||
| + | path = folder + '/' + file + '.htm' | ||
| + | |||
| + | try: | ||
| + | urllib.request.urlretrieve(url, path) | ||
| + | except: | ||
| + | print( "{0}: Invalid ID!".format( file ) ) | ||
return | return | ||
2020년 4월 22일 (수) 22:49 기준 최신판
CRAWLing
공공의 이익을 위해서가 아니면 안된다!!!
문화재
#!/usr/bin/python
import sys
import os
import urllib.request
import shutil
address = "http://www.heritage.go.kr/heri/cul/culSelectDetail.do?ccbaCpno=" # Cultural Heritage Administration, National Cultural Heritage Portal: heritage item detail URL (the ID is appended to it)
def downImage( document, sequence, urlstring ):
    """Download the first JPG URL found in *urlstring* next to *document*.

    The image is written to ``<document without its last four
    characters>-NNN.jpg`` where ``NNN`` is *sequence* zero-padded to three
    digits. If no URL can be extracted, or the download fails, the file
    ``noimage.jpg`` from the current working directory is copied to that
    path instead.

    Args:
        document: Path of the saved HTML page; its last four characters
            (the ``.htm`` suffix) are dropped to build the image name.
        sequence: Index of the image within the page.
        urlstring: A line of HTML expected to contain an absolute
            ``http:``/``https:`` URL ending in ``.jpg`` (any letter case).
    """
    import re  # local import: the script's import header does not provide ``re``

    target = "{0}-{1:03}.jpg".format(document[:-4], sequence)

    # Match on the original text (case-insensitively) instead of slicing with
    # indices taken from an upper-cased copy, and never run across quotes or
    # tag boundaries, so an earlier unrelated link on the same line cannot be
    # glued onto the image URL.
    match = re.search(r"https?:[^\s\"'<>]*?\.jpg", urlstring, re.IGNORECASE)
    if match is not None:
        try:
            urllib.request.urlretrieve(match.group(0), target)
        except Exception:
            # Best effort: any download problem falls back to the placeholder
            # below. ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
            pass
        else:
            return

    shutil.copy2('noimage.jpg', target)
def findImage( path ):
    """Scan the saved page at *path* and fetch its first JPG image.

    The file is read line by line as UTF-8. The first line containing
    ``.jpg`` (any letter case) is stripped and passed to ``downImage`` with
    sequence 0, after which scanning stops. If no line matches,
    ``"<path>: No Image"`` is printed and ``noimage.jpg`` is copied to
    ``<path without its last four characters>-000.jpg``.

    Args:
        path: Path of the HTML file to scan.
    """
    # ``with`` closes the file on the early return as well; the previous
    # explicit close() was skipped whenever an image was found.
    with open( path, 'rt', encoding='UTF8' ) as page:
        for line in page:
            if ".JPG" in line.upper():
                downImage( path, 0, line.strip() )
                return

    print( "{0}: No Image".format(path))
    shutil.copy2('noimage.jpg', "{0}-000.jpg".format(path[:-4]))
def downHtml( folder, file ):
    """Download the detail page for *file* into *folder*, then fetch its image.

    The page URL is the module-level ``address`` followed by *file*; the page
    is saved as ``<folder>/<file>.htm`` and then handed to ``findImage``. If
    the download fails, ``"<file>: Invalid Cpno!"`` is printed and no image
    lookup is attempted.

    Args:
        folder: Existing directory that receives the downloaded page.
        file: Item ID appended to ``address``; also used as the file stem.
    """
    url = address + file
    path = folder + '/' + file + '.htm'

    try:
        urllib.request.urlretrieve(url, path)
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that Ctrl-C during a
        # long crawl stops the program rather than being reported as a bad ID.
        print( "{0}: Invalid Cpno!".format( file ) )
        return

    findImage( path )
def main():
    """Crawl every ID listed in ``<NAME>.lst`` into the folder ``<NAME>``.

    ``NAME`` is the first command-line argument. The folder is created when
    missing, then each non-blank line of ``<NAME>.lst`` is stripped, announced
    with ``"<id>: Processing...."`` and passed to ``downHtml``. Without an
    argument a usage hint is written to stderr and nothing else happens.
    """
    if len(sys.argv) < 2:
        print( "usage: python <script> NAME  (reads NAME.lst, writes into folder NAME)", file=sys.stderr )
        return

    filename = sys.argv[1]
    list_path = filename + ".lst"
    folder = filename

    # Only "already exists" is tolerated; other failures (e.g. permissions)
    # surface here instead of as one failed download per list entry.
    os.makedirs( folder, exist_ok=True )

    with open( list_path ) as listing:
        for line in listing:
            item_id = line.strip()
            if not item_id:
                # A blank line would otherwise be crawled as an empty ID.
                continue
            print( "{0}: Processing....".format(item_id))
            downHtml( folder, item_id )


if __name__ == "__main__":
    main()
민족문화대백과사전
#!/usr/bin/python
import sys
import os
import urllib.request
import shutil
address1 = "http://encykorea.aks.ac.kr/Contents/Item/" # Encyclopedia of Korean Culture: article URL prefix
address2 = "http://encykorea.aks.ac.kr/Contents/GetImage?id=" # Encyclopedia of Korean Culture: image URL prefix
def downImage( document, sequence, urlstring ):
    """Download the image whose ID is *urlstring* next to *document*.

    The image URL is the module-level ``address2`` followed by *urlstring*.
    It is saved as ``<document without its last four characters>-NNN.jpg``
    where ``NNN`` is *sequence* zero-padded to three digits. If the download
    fails, ``noimage.jpg`` from the current working directory is copied to
    that path instead.

    Args:
        document: Path of the saved HTML page; its last four characters
            (the ``.htm`` suffix) are dropped to build the image name.
        sequence: Index of the image within the page.
        urlstring: Image ID appended to ``address2``.
    """
    url = address2 + urlstring
    target = "{0}-{1:03}.jpg".format(document[:-4], sequence)

    try:
        urllib.request.urlretrieve(url, target)
    except Exception:
        # Best-effort fallback to the placeholder; ``Exception`` rather than a
        # bare ``except`` so Ctrl-C still interrupts the crawl.
        shutil.copy2('noimage.jpg', target)
def findImage( path ):
    """Scan the saved page at *path* and fetch its first referenced image.

    The file is read line by line as UTF-8, looking for markup such as
    ``src="/Contents/GetImage?id=<id>&w=260&h=260&fit=w&clip=1"``. The first
    ID found is passed to ``downImage`` with sequence 0 and scanning stops.
    If no line matches, ``"<path>: No Image"`` is printed and ``noimage.jpg``
    is copied to ``<path without its last four characters>-000.jpg``.

    Args:
        path: Path of the HTML file to scan.
    """
    import re  # local import: the script's import header does not provide ``re``

    # The ID ends at the first "&", quote, whitespace or tag character. The
    # previous slice up to "&w=" produced a wrong ID whenever "&w=" was
    # missing from the line (find() returned -1).
    image_id = re.compile(r"/GetImage\?id=([^&\"'\s<>]+)")

    # ``with`` closes the file on the early return as well; the previous
    # explicit close() was skipped whenever an image was found.
    with open( path, 'rt', encoding='UTF8' ) as page:
        for line in page:
            match = image_id.search(line)
            if match is not None:
                downImage( path, 0, match.group(1) )
                return

    print( "{0}: No Image".format(path))
    shutil.copy2('noimage.jpg', "{0}-000.jpg".format(path[:-4]))
def downHtml( folder, file ):
    """Download the article page for *file* into *folder*, then fetch its image.

    The page URL is the module-level ``address1`` followed by *file*; the
    page is saved as ``<folder>/<file>.htm`` and then handed to
    ``findImage``. If the download fails, ``"<file>: Invalid ID!"`` is
    printed and no image lookup is attempted.

    Args:
        folder: Existing directory that receives the downloaded page.
        file: Article ID appended to ``address1``; also used as the file stem.
    """
    url = address1 + file
    path = folder + '/' + file + '.htm'

    try:
        urllib.request.urlretrieve(url, path)
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that Ctrl-C during a
        # long crawl stops the program rather than being reported as a bad ID.
        print( "{0}: Invalid ID!".format( file ) )
        return

    findImage( path )
def main():
    """Crawl every ID listed in ``<NAME>.lst`` into the folder ``<NAME>``.

    ``NAME`` is the first command-line argument. The folder is created when
    missing, then each non-blank line of ``<NAME>.lst`` is stripped, announced
    with ``"<id>: Processing...."`` and passed to ``downHtml``. Without an
    argument a usage hint is written to stderr and nothing else happens.
    """
    if len(sys.argv) < 2:
        print( "usage: python <script> NAME  (reads NAME.lst, writes into folder NAME)", file=sys.stderr )
        return

    filename = sys.argv[1]
    list_path = filename + ".lst"
    folder = filename

    # Only "already exists" is tolerated; other failures (e.g. permissions)
    # surface here instead of as one failed download per list entry.
    os.makedirs( folder, exist_ok=True )

    with open( list_path ) as listing:
        for line in listing:
            item_id = line.strip()
            if not item_id:
                # A blank line would otherwise be crawled as an empty ID.
                continue
            print( "{0}: Processing....".format(item_id))
            downHtml( folder, item_id )


if __name__ == "__main__":
    main()